Files
zulip/zerver/lib/html_to_text.py
Puneeth Chaganti bfc3e3c0c7 html_to_text: Add delimiters between text from different elements.
This module is used to render the HTML of pages like our user documentation 
into text for use in open graph previews of those articles.  It provided somewhat
confusing output in the case that there were paragraph breaks in the original message,
because text with multiple paragraphs and list items doesn't read very well. This commit
adds `|` as a delimiter between paragraphs, and prefixes list items with a `*`.

Closes #12228
2019-05-01 17:35:20 -07:00

41 lines
1.4 KiB
Python

from typing import Dict, Optional
from bs4 import BeautifulSoup
from django.http import HttpRequest
from django.utils.html import escape
from zerver.lib.cache import cache_with_key, open_graph_description_cache_key
def html_to_text(content: str, tags: Optional[Dict[str, str]]=None) -> str:
    """Flatten an HTML document into a short, escaped plain-text summary.

    content: the HTML markup to summarize.
    tags: maps a tag name to the delimiter inserted *before* the text of
        each matching element (except the first one collected).  When
        omitted, only ``<p>`` elements are used, joined by ``' | '``.

    Returns the collected text with runs of whitespace collapsed to single
    spaces and then passed through ``escape``.  Collection stops after the
    first element that pushes the accumulated text past 500 characters, so
    the result is not hard-truncated at that length.
    """
    soup = BeautifulSoup(content, features='lxml')

    # Empty out blocks that don't help describe the page: admonitions
    # (warnings, usually about needing to be an organization
    # administrator) and code-sections (navigation instructions).
    for css_class in ("admonition", "code-section"):
        for div in soup.find_all('div', class_=css_class):
            div.clear()

    if tags is None:
        tags = {'p': ' | '}

    pieces = []
    total_length = 0
    for node in soup.find_all(tags.keys()):
        # .text yields the element's content with the markup removed.
        node_text = node.text
        if not node_text:
            # Nothing to contribute; skip so no stray delimiter is added.
            continue
        if pieces:
            delimiter = tags[node.name]
            pieces.append(delimiter)
            total_length += len(delimiter)
        pieces.append(node_text)
        total_length += len(node_text)
        if total_length > 500:
            break

    collapsed = ' '.join(''.join(pieces).split())
    return escape(collapsed)
@cache_with_key(open_graph_description_cache_key, timeout=3600*24)
def get_content_description(content: bytes, request: HttpRequest) -> str:
    """Return the plain-text description for a rendered page body.

    content: the page body as UTF-8 encoded bytes.
    request: not read by this function's body; presumably consumed by
        ``open_graph_description_cache_key`` to build the cache key --
        confirm against that helper.

    The result is cached via ``cache_with_key`` with a timeout of
    ``3600*24`` (one day, if the timeout is in seconds).
    """
    return html_to_text(content.decode("utf-8"))