mirror of
https://github.com/zulip/zulip.git
synced 2025-11-10 00:46:03 +00:00
url_preview: Allow Beautiful Soup to get the charset from <meta>.
An HTML document sent without a charset in the Content-Type header needs to be scanned for a charset in <meta> tags. We need to pass bytes instead of str to Beautiful Soup to allow it to do this. Fixes #16843. Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
committed by
Tim Abbott
parent
daac7536f3
commit
bf45f921a7
@@ -314,6 +314,7 @@ class HostRequestMock:
|
||||
|
||||
class MockPythonResponse:
|
||||
def __init__(self, text: str, status_code: int, headers: Optional[Dict[str, str]]=None) -> None:
|
||||
self.content = text.encode()
|
||||
self.text = text
|
||||
self.status_code = status_code
|
||||
if headers is None:
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
from typing import Any
|
||||
import cgi
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
class BaseParser:
|
||||
def __init__(self, html_source: str) -> None:
|
||||
def __init__(self, html_source: bytes, content_type: Optional[str]) -> None:
|
||||
# We import BeautifulSoup here, because it's not used by most
|
||||
# processes in production, and bs4 is big enough that
|
||||
# importing it adds 10s of milliseconds to manage.py startup.
|
||||
from bs4 import BeautifulSoup
|
||||
self._soup = BeautifulSoup(html_source, "lxml")
|
||||
charset = None
|
||||
if content_type is not None:
|
||||
charset = cgi.parse_header(content_type)[1].get("charset")
|
||||
self._soup = BeautifulSoup(html_source, "lxml", from_encoding=charset)
|
||||
|
||||
def extract_data(self) -> Any:
|
||||
raise NotImplementedError()
|
||||
|
||||
@@ -91,12 +91,16 @@ def get_link_embed_data(url: str,
|
||||
|
||||
response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
|
||||
if response.ok:
|
||||
og_data = OpenGraphParser(response.text).extract_data()
|
||||
og_data = OpenGraphParser(
|
||||
response.content, response.headers.get("Content-Type")
|
||||
).extract_data()
|
||||
for key in ['title', 'description', 'image']:
|
||||
if not data.get(key) and og_data.get(key):
|
||||
data[key] = og_data[key]
|
||||
|
||||
generic_data = GenericParser(response.text).extract_data() or {}
|
||||
generic_data = GenericParser(
|
||||
response.content, response.headers.get("Content-Type")
|
||||
).extract_data() or {}
|
||||
for key in ['title', 'description', 'image']:
|
||||
if not data.get(key) and generic_data.get(key):
|
||||
data[key] = generic_data[key]
|
||||
|
||||
@@ -136,7 +136,7 @@ class OembedTestCase(ZulipTestCase):
|
||||
|
||||
class OpenGraphParserTestCase(ZulipTestCase):
|
||||
def test_page_with_og(self) -> None:
|
||||
html = """<html>
|
||||
html = b"""<html>
|
||||
<head>
|
||||
<meta property="og:title" content="The Rock" />
|
||||
<meta property="og:type" content="video.movie" />
|
||||
@@ -146,14 +146,14 @@ class OpenGraphParserTestCase(ZulipTestCase):
|
||||
</head>
|
||||
</html>"""
|
||||
|
||||
parser = OpenGraphParser(html)
|
||||
parser = OpenGraphParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertIn('title', result)
|
||||
self.assertEqual(result['title'], 'The Rock')
|
||||
self.assertEqual(result.get('description'), 'The Rock film')
|
||||
|
||||
def test_page_with_evil_og_tags(self) -> None:
|
||||
html = """<html>
|
||||
html = b"""<html>
|
||||
<head>
|
||||
<meta property="og:title" content="The Rock" />
|
||||
<meta property="og:type" content="video.movie" />
|
||||
@@ -165,7 +165,7 @@ class OpenGraphParserTestCase(ZulipTestCase):
|
||||
</head>
|
||||
</html>"""
|
||||
|
||||
parser = OpenGraphParser(html)
|
||||
parser = OpenGraphParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertIn('title', result)
|
||||
self.assertEqual(result['title'], 'The Rock')
|
||||
@@ -173,9 +173,30 @@ class OpenGraphParserTestCase(ZulipTestCase):
|
||||
self.assertEqual(result.get('oembed'), None)
|
||||
self.assertEqual(result.get('html'), None)
|
||||
|
||||
def test_charset_in_header(self) -> None:
|
||||
html = """<html>
|
||||
<head>
|
||||
<meta property="og:title" content="中文" />
|
||||
</head>
|
||||
</html>""".encode("big5")
|
||||
parser = OpenGraphParser(html, "text/html; charset=Big5")
|
||||
result = parser.extract_data()
|
||||
self.assertEqual(result["title"], "中文")
|
||||
|
||||
def test_charset_in_meta(self) -> None:
|
||||
html = """<html>
|
||||
<head>
|
||||
<meta content-type="text/html; charset=Big5" />
|
||||
<meta property="og:title" content="中文" />
|
||||
</head>
|
||||
</html>""".encode("big5")
|
||||
parser = OpenGraphParser(html, "text/html")
|
||||
result = parser.extract_data()
|
||||
self.assertEqual(result["title"], "中文")
|
||||
|
||||
class GenericParserTestCase(ZulipTestCase):
|
||||
def test_parser(self) -> None:
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title>Test title</title></head>
|
||||
<body>
|
||||
@@ -184,13 +205,13 @@ class GenericParserTestCase(ZulipTestCase):
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
parser = GenericParser(html)
|
||||
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertEqual(result.get('title'), 'Test title')
|
||||
self.assertEqual(result.get('description'), 'Description text')
|
||||
|
||||
def test_extract_image(self) -> None:
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<body>
|
||||
<h1>Main header</h1>
|
||||
@@ -202,14 +223,14 @@ class GenericParserTestCase(ZulipTestCase):
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
parser = GenericParser(html)
|
||||
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertEqual(result.get('title'), 'Main header')
|
||||
self.assertEqual(result.get('description'), 'Description text')
|
||||
self.assertEqual(result.get('image'), 'http://test.com/test.jpg')
|
||||
|
||||
def test_extract_description(self) -> None:
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<body>
|
||||
<div>
|
||||
@@ -220,22 +241,22 @@ class GenericParserTestCase(ZulipTestCase):
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
parser = GenericParser(html)
|
||||
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertEqual(result.get('description'), 'Description text')
|
||||
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><meta name="description" content="description 123"</head>
|
||||
<body></body>
|
||||
</html>
|
||||
"""
|
||||
parser = GenericParser(html)
|
||||
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertEqual(result.get('description'), 'description 123')
|
||||
|
||||
html = "<html><body></body></html>"
|
||||
parser = GenericParser(html)
|
||||
html = b"<html><body></body></html>"
|
||||
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||
result = parser.extract_data()
|
||||
self.assertIsNone(result.get('description'))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user