Files
zulip/zerver/lib/url_preview/parsers/base.py
Anders Kaseorg bf45f921a7 url_preview: Allow Beautiful Soup to get the charset from <meta>.
An HTML document sent without a charset in the Content-Type header
needs to be scanned for a charset in <meta> tags.  We need to pass
bytes instead of str to Beautiful Soup to allow it to do this.

Fixes #16843.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
2020-12-15 11:30:57 -08:00

18 lines
667 B
Python

import cgi
from typing import Any, Optional
class BaseParser:
    """Base class for URL preview parsers that operate on parsed HTML.

    The constructor parses the raw document with Beautiful Soup (using the
    "lxml" tree builder) and stores the result on ``self._soup``.
    Subclasses are expected to override ``extract_data``.
    """

    def __init__(self, html_source: bytes, content_type: Optional[str]) -> None:
        """Parse ``html_source`` and keep the resulting soup on the instance.

        Args:
            html_source: The raw, undecoded bytes of the HTML document.
            content_type: The value of the Content-Type header the document
                was served with, or None when no such header is available.
        """
        # We import BeautifulSoup here, because it's not used by most
        # processes in production, and bs4 is big enough that
        # importing it adds tens of milliseconds to manage.py startup.
        from bs4 import BeautifulSoup

        charset = self._charset_from_content_type(content_type)
        # The document is handed over as bytes, with the header charset (if
        # any) passed as from_encoding; when it is None, Beautiful Soup is
        # left to determine the encoding from the document itself.
        self._soup = BeautifulSoup(html_source, "lxml", from_encoding=charset)

    @staticmethod
    def _charset_from_content_type(content_type: Optional[str]) -> Optional[str]:
        """Return the ``charset`` parameter of a Content-Type header value.

        Returns None when ``content_type`` is None or carries no charset
        parameter.  The charset is returned lower-cased, as produced by
        ``EmailMessage.get_content_charset``.
        """
        if content_type is None:
            return None

        # The cgi module (and cgi.parse_header with it) was deprecated by
        # PEP 594 and removed in Python 3.13; the email package is the
        # standard-library replacement for parsing MIME-style headers.
        from email.message import EmailMessage

        message = EmailMessage()
        message["Content-Type"] = content_type
        return message.get_content_charset()

    def extract_data(self) -> Any:
        """Return the data extracted from the parsed document.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError()