Files
zulip/zerver/lib/html_to_text.py
Anders Kaseorg 91a86c24f5 python: Replace None defaults with empty collections where appropriate.
Use read-only types (List ↦ Sequence, Dict ↦ Mapping, Set ↦
AbstractSet) to guard against accidental mutation of the default
value.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
2020-06-13 15:31:27 -07:00

40 lines
1.3 KiB
Python

from typing import Mapping
from bs4 import BeautifulSoup
from django.http import HttpRequest
from django.utils.html import escape
from zerver.lib.cache import cache_with_key, open_graph_description_cache_key
def html_to_text(content: str, tags: Mapping[str, str] = {'p': ' | '}) -> str:
    """Build an HTML-escaped plain-text summary of an HTML document.

    The text of each element whose tag name is a key of ``tags`` is
    gathered in the order ``find_all`` returns it.  Every piece after the
    first is preceded by the separator that ``tags`` maps the element's tag
    name to.  Gathering stops as soon as more than 500 characters have been
    accumulated.  Finally, runs of whitespace are collapsed into single
    spaces and the result is passed through ``escape``.
    """
    soup = BeautifulSoup(content, features='lxml')

    # Empty out blocks that should not contribute to the summary:
    #   * admonition (warning) blocks are usually about users needing to be
    #     an organization administrator, which doesn't describe the page;
    #   * code-sections just contain navigation instructions.
    for css_class in ("admonition", "code-section"):
        for div in soup.find_all('div', class_=css_class):
            div.clear()

    summary = ''
    for node in soup.find_all(tags.keys()):
        # .text converts the element from HTML to text.
        node_text = node.text
        if not node_text:
            # Empty elements add nothing (and must not add a separator).
            continue
        separator = tags[node.name] if summary else ''
        summary += separator + node_text
        if len(summary) > 500:
            break

    collapsed = ' '.join(summary.split())
    return escape(collapsed)
@cache_with_key(open_graph_description_cache_key, timeout=3600*24)
def get_content_description(content: bytes, request: HttpRequest) -> str:
    """Return the text description for a UTF-8 encoded HTML body.

    ``content`` is decoded as UTF-8 and handed to ``html_to_text`` with its
    default tag mapping.  ``request`` is not used in the body.

    NOTE(review): the function is wrapped by ``cache_with_key`` with
    ``open_graph_description_cache_key`` and a timeout of 3600*24;
    presumably ``request`` feeds the cache key and results are cached for
    one day -- confirm against zerver.lib.cache.
    """
    return html_to_text(content.decode("utf-8"))