mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-03 21:43:21 +00:00 
			
		
		
		
	This module is used to render the HTML of pages like our user documentation into text for use in open graph previews of those articles. It produced somewhat confusing output when the original content contained paragraph breaks, because text with multiple paragraphs and list items doesn't read very well once flattened. This commit adds `|` as a delimiter between paragraphs, and prefixes list items with a `*`. Closes #12228
		
			
				
	
	
		
			41 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			41 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from typing import Dict, Optional
 | 
						|
 | 
						|
from bs4 import BeautifulSoup
 | 
						|
from django.http import HttpRequest
 | 
						|
from django.utils.html import escape
 | 
						|
 | 
						|
from zerver.lib.cache import cache_with_key, open_graph_description_cache_key
 | 
						|
 | 
						|
def html_to_text(content: str, tags: Optional[Dict[str, str]]=None) -> str:
    """Flatten an HTML document into a short, escaped plain-text summary.

    `content` is parsed with BeautifulSoup's lxml parser.  `tags` maps
    an element name to the separator string inserted *before* the text
    of each matching element (except the first one emitted); when it
    is None, only `<p>` elements are collected, separated by ' | '.

    Collection stops after the first element that pushes the
    accumulated text past 500 characters; the text is not truncated,
    so the result may be longer than 500 characters.  Runs of
    whitespace are collapsed to single spaces and the result is passed
    through Django's `escape` before being returned.
    """
    soup = BeautifulSoup(content, features='lxml')

    # Empty out blocks that don't help describe the page:
    #  * admonition (warning) blocks are usually about users needing
    #    to be an organization administrator;
    #  * code-sections just contain navigation instructions.
    for css_class in ("admonition", "code-section"):
        for div in soup.find_all('div', class_=css_class):
            div.clear()

    separators = {'p': ' | '} if tags is None else tags

    pieces = []
    total_length = 0
    for node in soup.find_all(separators.keys()):
        # .text converts the element from HTML to text
        node_text = node.text
        if not node_text:
            # Nothing to contribute; skip empty elements.
            continue
        if pieces:
            # Only put a separator between elements, never in front
            # of the first one.
            separator = separators[node.name]
            pieces.append(separator)
            total_length += len(separator)
        pieces.append(node_text)
        total_length += len(node_text)
        if total_length > 500:
            break

    collapsed = ' '.join(''.join(pieces).split())
    return escape(collapsed)
 | 
						|
 | 
						|
@cache_with_key(open_graph_description_cache_key, timeout=3600*24)
def get_content_description(content: bytes, request: HttpRequest) -> str:
    """Return the plain-text description for a rendered HTML page.

    `content` is decoded as UTF-8 and handed to `html_to_text` with
    its default tag settings.  `request` is not used in the body;
    presumably it is consumed by `open_graph_description_cache_key`
    to build the cache key -- confirm against zerver.lib.cache.
    """
    return html_to_text(content.decode("utf-8"))
 |