url_preview: Allow Beautiful Soup to get the charset from <meta>.

An HTML document sent without a charset in the Content-Type header
needs to be scanned for a charset in <meta> tags.  We need to pass
bytes instead of str to Beautiful Soup to allow it to do this.

Fixes #16843.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg
2020-12-07 19:26:30 -08:00
committed by Tim Abbott
parent daac7536f3
commit bf45f921a7
4 changed files with 49 additions and 19 deletions

View File

@@ -314,6 +314,7 @@ class HostRequestMock:
class MockPythonResponse: class MockPythonResponse:
def __init__(self, text: str, status_code: int, headers: Optional[Dict[str, str]]=None) -> None: def __init__(self, text: str, status_code: int, headers: Optional[Dict[str, str]]=None) -> None:
self.content = text.encode()
self.text = text self.text = text
self.status_code = status_code self.status_code = status_code
if headers is None: if headers is None:

View File

@@ -1,13 +1,17 @@
from typing import Any import cgi
from typing import Any, Optional
class BaseParser: class BaseParser:
def __init__(self, html_source: str) -> None: def __init__(self, html_source: bytes, content_type: Optional[str]) -> None:
# We import BeautifulSoup here, because it's not used by most # We import BeautifulSoup here, because it's not used by most
# processes in production, and bs4 is big enough that # processes in production, and bs4 is big enough that
# importing it adds 10s of milliseconds to manage.py startup. # importing it adds 10s of milliseconds to manage.py startup.
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
self._soup = BeautifulSoup(html_source, "lxml") charset = None
if content_type is not None:
charset = cgi.parse_header(content_type)[1].get("charset")
self._soup = BeautifulSoup(html_source, "lxml", from_encoding=charset)
def extract_data(self) -> Any: def extract_data(self) -> Any:
raise NotImplementedError() raise NotImplementedError()

View File

@@ -91,12 +91,16 @@ def get_link_embed_data(url: str,
response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT) response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
if response.ok: if response.ok:
og_data = OpenGraphParser(response.text).extract_data() og_data = OpenGraphParser(
response.content, response.headers.get("Content-Type")
).extract_data()
for key in ['title', 'description', 'image']: for key in ['title', 'description', 'image']:
if not data.get(key) and og_data.get(key): if not data.get(key) and og_data.get(key):
data[key] = og_data[key] data[key] = og_data[key]
generic_data = GenericParser(response.text).extract_data() or {} generic_data = GenericParser(
response.content, response.headers.get("Content-Type")
).extract_data() or {}
for key in ['title', 'description', 'image']: for key in ['title', 'description', 'image']:
if not data.get(key) and generic_data.get(key): if not data.get(key) and generic_data.get(key):
data[key] = generic_data[key] data[key] = generic_data[key]

View File

@@ -136,7 +136,7 @@ class OembedTestCase(ZulipTestCase):
class OpenGraphParserTestCase(ZulipTestCase): class OpenGraphParserTestCase(ZulipTestCase):
def test_page_with_og(self) -> None: def test_page_with_og(self) -> None:
html = """<html> html = b"""<html>
<head> <head>
<meta property="og:title" content="The Rock" /> <meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" /> <meta property="og:type" content="video.movie" />
@@ -146,14 +146,14 @@ class OpenGraphParserTestCase(ZulipTestCase):
</head> </head>
</html>""" </html>"""
parser = OpenGraphParser(html) parser = OpenGraphParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertIn('title', result) self.assertIn('title', result)
self.assertEqual(result['title'], 'The Rock') self.assertEqual(result['title'], 'The Rock')
self.assertEqual(result.get('description'), 'The Rock film') self.assertEqual(result.get('description'), 'The Rock film')
def test_page_with_evil_og_tags(self) -> None: def test_page_with_evil_og_tags(self) -> None:
html = """<html> html = b"""<html>
<head> <head>
<meta property="og:title" content="The Rock" /> <meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" /> <meta property="og:type" content="video.movie" />
@@ -165,7 +165,7 @@ class OpenGraphParserTestCase(ZulipTestCase):
</head> </head>
</html>""" </html>"""
parser = OpenGraphParser(html) parser = OpenGraphParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertIn('title', result) self.assertIn('title', result)
self.assertEqual(result['title'], 'The Rock') self.assertEqual(result['title'], 'The Rock')
@@ -173,9 +173,30 @@ class OpenGraphParserTestCase(ZulipTestCase):
self.assertEqual(result.get('oembed'), None) self.assertEqual(result.get('oembed'), None)
self.assertEqual(result.get('html'), None) self.assertEqual(result.get('html'), None)
def test_charset_in_header(self) -> None:
html = """<html>
<head>
<meta property="og:title" content="中文" />
</head>
</html>""".encode("big5")
parser = OpenGraphParser(html, "text/html; charset=Big5")
result = parser.extract_data()
self.assertEqual(result["title"], "中文")
def test_charset_in_meta(self) -> None:
html = """<html>
<head>
<meta content-type="text/html; charset=Big5" />
<meta property="og:title" content="中文" />
</head>
</html>""".encode("big5")
parser = OpenGraphParser(html, "text/html")
result = parser.extract_data()
self.assertEqual(result["title"], "中文")
class GenericParserTestCase(ZulipTestCase): class GenericParserTestCase(ZulipTestCase):
def test_parser(self) -> None: def test_parser(self) -> None:
html = """ html = b"""
<html> <html>
<head><title>Test title</title></head> <head><title>Test title</title></head>
<body> <body>
@@ -184,13 +205,13 @@ class GenericParserTestCase(ZulipTestCase):
</body> </body>
</html> </html>
""" """
parser = GenericParser(html) parser = GenericParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertEqual(result.get('title'), 'Test title') self.assertEqual(result.get('title'), 'Test title')
self.assertEqual(result.get('description'), 'Description text') self.assertEqual(result.get('description'), 'Description text')
def test_extract_image(self) -> None: def test_extract_image(self) -> None:
html = """ html = b"""
<html> <html>
<body> <body>
<h1>Main header</h1> <h1>Main header</h1>
@@ -202,14 +223,14 @@ class GenericParserTestCase(ZulipTestCase):
</body> </body>
</html> </html>
""" """
parser = GenericParser(html) parser = GenericParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertEqual(result.get('title'), 'Main header') self.assertEqual(result.get('title'), 'Main header')
self.assertEqual(result.get('description'), 'Description text') self.assertEqual(result.get('description'), 'Description text')
self.assertEqual(result.get('image'), 'http://test.com/test.jpg') self.assertEqual(result.get('image'), 'http://test.com/test.jpg')
def test_extract_description(self) -> None: def test_extract_description(self) -> None:
html = """ html = b"""
<html> <html>
<body> <body>
<div> <div>
@@ -220,22 +241,22 @@ class GenericParserTestCase(ZulipTestCase):
</body> </body>
</html> </html>
""" """
parser = GenericParser(html) parser = GenericParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertEqual(result.get('description'), 'Description text') self.assertEqual(result.get('description'), 'Description text')
html = """ html = b"""
<html> <html>
<head><meta name="description" content="description 123"</head> <head><meta name="description" content="description 123"</head>
<body></body> <body></body>
</html> </html>
""" """
parser = GenericParser(html) parser = GenericParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertEqual(result.get('description'), 'description 123') self.assertEqual(result.get('description'), 'description 123')
html = "<html><body></body></html>" html = b"<html><body></body></html>"
parser = GenericParser(html) parser = GenericParser(html, "text/html; charset=UTF-8")
result = parser.extract_data() result = parser.extract_data()
self.assertIsNone(result.get('description')) self.assertIsNone(result.get('description'))