mirror of
https://github.com/zulip/zulip.git
synced 2025-11-10 08:56:10 +00:00
url_preview: Allow Beautiful Soup to get the charset from <meta>.
An HTML document sent without a charset in the Content-Type header needs to be scanned for a charset in <meta> tags. We need to pass bytes instead of str to Beautiful Soup to allow it to do this. Fixes #16843. Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
committed by
Tim Abbott
parent
daac7536f3
commit
bf45f921a7
@@ -314,6 +314,7 @@ class HostRequestMock:
|
|||||||
|
|
||||||
class MockPythonResponse:
|
class MockPythonResponse:
|
||||||
def __init__(self, text: str, status_code: int, headers: Optional[Dict[str, str]]=None) -> None:
|
def __init__(self, text: str, status_code: int, headers: Optional[Dict[str, str]]=None) -> None:
|
||||||
|
self.content = text.encode()
|
||||||
self.text = text
|
self.text = text
|
||||||
self.status_code = status_code
|
self.status_code = status_code
|
||||||
if headers is None:
|
if headers is None:
|
||||||
|
|||||||
@@ -1,13 +1,17 @@
|
|||||||
from typing import Any
|
import cgi
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
|
||||||
class BaseParser:
|
class BaseParser:
|
||||||
def __init__(self, html_source: str) -> None:
|
def __init__(self, html_source: bytes, content_type: Optional[str]) -> None:
|
||||||
# We import BeautifulSoup here, because it's not used by most
|
# We import BeautifulSoup here, because it's not used by most
|
||||||
# processes in production, and bs4 is big enough that
|
# processes in production, and bs4 is big enough that
|
||||||
# importing it adds 10s of milliseconds to manage.py startup.
|
# importing it adds 10s of milliseconds to manage.py startup.
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
self._soup = BeautifulSoup(html_source, "lxml")
|
charset = None
|
||||||
|
if content_type is not None:
|
||||||
|
charset = cgi.parse_header(content_type)[1].get("charset")
|
||||||
|
self._soup = BeautifulSoup(html_source, "lxml", from_encoding=charset)
|
||||||
|
|
||||||
def extract_data(self) -> Any:
|
def extract_data(self) -> Any:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|||||||
@@ -91,12 +91,16 @@ def get_link_embed_data(url: str,
|
|||||||
|
|
||||||
response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
|
response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
|
||||||
if response.ok:
|
if response.ok:
|
||||||
og_data = OpenGraphParser(response.text).extract_data()
|
og_data = OpenGraphParser(
|
||||||
|
response.content, response.headers.get("Content-Type")
|
||||||
|
).extract_data()
|
||||||
for key in ['title', 'description', 'image']:
|
for key in ['title', 'description', 'image']:
|
||||||
if not data.get(key) and og_data.get(key):
|
if not data.get(key) and og_data.get(key):
|
||||||
data[key] = og_data[key]
|
data[key] = og_data[key]
|
||||||
|
|
||||||
generic_data = GenericParser(response.text).extract_data() or {}
|
generic_data = GenericParser(
|
||||||
|
response.content, response.headers.get("Content-Type")
|
||||||
|
).extract_data() or {}
|
||||||
for key in ['title', 'description', 'image']:
|
for key in ['title', 'description', 'image']:
|
||||||
if not data.get(key) and generic_data.get(key):
|
if not data.get(key) and generic_data.get(key):
|
||||||
data[key] = generic_data[key]
|
data[key] = generic_data[key]
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ class OembedTestCase(ZulipTestCase):
|
|||||||
|
|
||||||
class OpenGraphParserTestCase(ZulipTestCase):
|
class OpenGraphParserTestCase(ZulipTestCase):
|
||||||
def test_page_with_og(self) -> None:
|
def test_page_with_og(self) -> None:
|
||||||
html = """<html>
|
html = b"""<html>
|
||||||
<head>
|
<head>
|
||||||
<meta property="og:title" content="The Rock" />
|
<meta property="og:title" content="The Rock" />
|
||||||
<meta property="og:type" content="video.movie" />
|
<meta property="og:type" content="video.movie" />
|
||||||
@@ -146,14 +146,14 @@ class OpenGraphParserTestCase(ZulipTestCase):
|
|||||||
</head>
|
</head>
|
||||||
</html>"""
|
</html>"""
|
||||||
|
|
||||||
parser = OpenGraphParser(html)
|
parser = OpenGraphParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertIn('title', result)
|
self.assertIn('title', result)
|
||||||
self.assertEqual(result['title'], 'The Rock')
|
self.assertEqual(result['title'], 'The Rock')
|
||||||
self.assertEqual(result.get('description'), 'The Rock film')
|
self.assertEqual(result.get('description'), 'The Rock film')
|
||||||
|
|
||||||
def test_page_with_evil_og_tags(self) -> None:
|
def test_page_with_evil_og_tags(self) -> None:
|
||||||
html = """<html>
|
html = b"""<html>
|
||||||
<head>
|
<head>
|
||||||
<meta property="og:title" content="The Rock" />
|
<meta property="og:title" content="The Rock" />
|
||||||
<meta property="og:type" content="video.movie" />
|
<meta property="og:type" content="video.movie" />
|
||||||
@@ -165,7 +165,7 @@ class OpenGraphParserTestCase(ZulipTestCase):
|
|||||||
</head>
|
</head>
|
||||||
</html>"""
|
</html>"""
|
||||||
|
|
||||||
parser = OpenGraphParser(html)
|
parser = OpenGraphParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertIn('title', result)
|
self.assertIn('title', result)
|
||||||
self.assertEqual(result['title'], 'The Rock')
|
self.assertEqual(result['title'], 'The Rock')
|
||||||
@@ -173,9 +173,30 @@ class OpenGraphParserTestCase(ZulipTestCase):
|
|||||||
self.assertEqual(result.get('oembed'), None)
|
self.assertEqual(result.get('oembed'), None)
|
||||||
self.assertEqual(result.get('html'), None)
|
self.assertEqual(result.get('html'), None)
|
||||||
|
|
||||||
|
def test_charset_in_header(self) -> None:
|
||||||
|
html = """<html>
|
||||||
|
<head>
|
||||||
|
<meta property="og:title" content="中文" />
|
||||||
|
</head>
|
||||||
|
</html>""".encode("big5")
|
||||||
|
parser = OpenGraphParser(html, "text/html; charset=Big5")
|
||||||
|
result = parser.extract_data()
|
||||||
|
self.assertEqual(result["title"], "中文")
|
||||||
|
|
||||||
|
def test_charset_in_meta(self) -> None:
|
||||||
|
html = """<html>
|
||||||
|
<head>
|
||||||
|
<meta content-type="text/html; charset=Big5" />
|
||||||
|
<meta property="og:title" content="中文" />
|
||||||
|
</head>
|
||||||
|
</html>""".encode("big5")
|
||||||
|
parser = OpenGraphParser(html, "text/html")
|
||||||
|
result = parser.extract_data()
|
||||||
|
self.assertEqual(result["title"], "中文")
|
||||||
|
|
||||||
class GenericParserTestCase(ZulipTestCase):
|
class GenericParserTestCase(ZulipTestCase):
|
||||||
def test_parser(self) -> None:
|
def test_parser(self) -> None:
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><title>Test title</title></head>
|
<head><title>Test title</title></head>
|
||||||
<body>
|
<body>
|
||||||
@@ -184,13 +205,13 @@ class GenericParserTestCase(ZulipTestCase):
|
|||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
parser = GenericParser(html)
|
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertEqual(result.get('title'), 'Test title')
|
self.assertEqual(result.get('title'), 'Test title')
|
||||||
self.assertEqual(result.get('description'), 'Description text')
|
self.assertEqual(result.get('description'), 'Description text')
|
||||||
|
|
||||||
def test_extract_image(self) -> None:
|
def test_extract_image(self) -> None:
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
<h1>Main header</h1>
|
<h1>Main header</h1>
|
||||||
@@ -202,14 +223,14 @@ class GenericParserTestCase(ZulipTestCase):
|
|||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
parser = GenericParser(html)
|
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertEqual(result.get('title'), 'Main header')
|
self.assertEqual(result.get('title'), 'Main header')
|
||||||
self.assertEqual(result.get('description'), 'Description text')
|
self.assertEqual(result.get('description'), 'Description text')
|
||||||
self.assertEqual(result.get('image'), 'http://test.com/test.jpg')
|
self.assertEqual(result.get('image'), 'http://test.com/test.jpg')
|
||||||
|
|
||||||
def test_extract_description(self) -> None:
|
def test_extract_description(self) -> None:
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
<div>
|
<div>
|
||||||
@@ -220,22 +241,22 @@ class GenericParserTestCase(ZulipTestCase):
|
|||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
parser = GenericParser(html)
|
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertEqual(result.get('description'), 'Description text')
|
self.assertEqual(result.get('description'), 'Description text')
|
||||||
|
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><meta name="description" content="description 123"</head>
|
<head><meta name="description" content="description 123"</head>
|
||||||
<body></body>
|
<body></body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
parser = GenericParser(html)
|
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertEqual(result.get('description'), 'description 123')
|
self.assertEqual(result.get('description'), 'description 123')
|
||||||
|
|
||||||
html = "<html><body></body></html>"
|
html = b"<html><body></body></html>"
|
||||||
parser = GenericParser(html)
|
parser = GenericParser(html, "text/html; charset=UTF-8")
|
||||||
result = parser.extract_data()
|
result = parser.extract_data()
|
||||||
self.assertIsNone(result.get('description'))
|
self.assertIsNone(result.get('description'))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user