mirror of
https://github.com/zulip/zulip.git
synced 2025-10-23 04:52:12 +00:00
upload: Attempt to guess the character set of text/* types which are served inline.
Currently, this is only `text/plain`. In cases where the client-provided content-type does not already specify a `charset`, we use `chardet` to make a guess, and store that guess to provide later when serving the content. The detection is done in a streaming fashion, and thus should not require re-downloading the full content.
This commit is contained in:
committed by
Tim Abbott
parent
ae001dfa96
commit
25fbb05fea
@@ -204,6 +204,9 @@ prod = [
|
||||
|
||||
# For E2EE of push notifications
|
||||
"pynacl",
|
||||
|
||||
# Character set detection for text/plain
|
||||
"chardet>=5.1.0"
|
||||
]
|
||||
docs = [
|
||||
# Needed to build RTD docs
|
||||
@@ -274,7 +277,6 @@ dev = [
|
||||
"SQLAlchemy[mypy]",
|
||||
"types-beautifulsoup4",
|
||||
"types-boto",
|
||||
"types-chardet",
|
||||
"types-decorator",
|
||||
"types-defusedxml",
|
||||
"types-jsonschema",
|
||||
|
24
uv.lock
generated
24
uv.lock
generated
@@ -538,6 +538,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "5.2.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.2"
|
||||
@@ -4700,15 +4709,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/86/e26e6ae4dfcbf6031b8422c22cf3a9eb2b6d127770406e7645b6248d8091/types_cffi-1.17.0.20250523-py3-none-any.whl", hash = "sha256:e98c549d8e191f6220e440f9f14315d6775a21a0e588c32c20476be885b2fad9", size = 20010, upload-time = "2025-05-23T03:05:39.136Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-chardet"
|
||||
version = "5.0.4.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/dd/47/932d35ac07203e936e69102dc9570e83606d386bacb60696f0c403224e86/types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818", size = 4592, upload-time = "2023-05-10T15:22:21.325Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/10/35/2a06c5c892eb1a0a4f4f74a6aff1ade05da82444af0190cf731761f2c46c/types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d", size = 5853, upload-time = "2023-05-10T15:22:19.797Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-decorator"
|
||||
version = "5.2.0.20250324"
|
||||
@@ -5359,6 +5359,7 @@ dev = [
|
||||
{ name = "black" },
|
||||
{ name = "boto3" },
|
||||
{ name = "boto3-stubs", extra = ["s3", "ses", "sns", "sqs"] },
|
||||
{ name = "chardet" },
|
||||
{ name = "circuitbreaker" },
|
||||
{ name = "codespell" },
|
||||
{ name = "coverage" },
|
||||
@@ -5450,7 +5451,6 @@ dev = [
|
||||
{ name = "tornado" },
|
||||
{ name = "types-beautifulsoup4" },
|
||||
{ name = "types-boto" },
|
||||
{ name = "types-chardet" },
|
||||
{ name = "types-decorator" },
|
||||
{ name = "types-defusedxml" },
|
||||
{ name = "types-jsonschema" },
|
||||
@@ -5494,6 +5494,7 @@ prod = [
|
||||
{ name = "backoff" },
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "boto3" },
|
||||
{ name = "chardet" },
|
||||
{ name = "circuitbreaker" },
|
||||
{ name = "cryptography" },
|
||||
{ name = "css-inline" },
|
||||
@@ -5582,6 +5583,7 @@ dev = [
|
||||
{ name = "black" },
|
||||
{ name = "boto3" },
|
||||
{ name = "boto3-stubs", extras = ["s3", "ses", "sns", "sqs"] },
|
||||
{ name = "chardet", specifier = ">=5.1.0" },
|
||||
{ name = "circuitbreaker" },
|
||||
{ name = "codespell" },
|
||||
{ name = "coverage" },
|
||||
@@ -5674,7 +5676,6 @@ dev = [
|
||||
{ name = "tornado" },
|
||||
{ name = "types-beautifulsoup4" },
|
||||
{ name = "types-boto" },
|
||||
{ name = "types-chardet" },
|
||||
{ name = "types-decorator" },
|
||||
{ name = "types-defusedxml" },
|
||||
{ name = "types-jsonschema" },
|
||||
@@ -5718,6 +5719,7 @@ prod = [
|
||||
{ name = "backoff" },
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "boto3" },
|
||||
{ name = "chardet", specifier = ">=5.1.0" },
|
||||
{ name = "circuitbreaker" },
|
||||
{ name = "cryptography" },
|
||||
{ name = "css-inline" },
|
||||
|
@@ -49,4 +49,4 @@ API_FEATURE_LEVEL = 408
|
||||
# historical commits sharing the same major version, in which case a
|
||||
# minor version bump suffices.
|
||||
|
||||
PROVISION_VERSION = (337, 1) # bumped 2025-07-21 to upgrade shfmt
|
||||
PROVISION_VERSION = (338, 0) # bumped 2025-07-18 to add chardet and remove types-chardet
|
||||
|
@@ -9,6 +9,7 @@ from email.message import EmailMessage
|
||||
from typing import IO, Any
|
||||
from urllib.parse import unquote, urljoin
|
||||
|
||||
import chardet
|
||||
import pyvips
|
||||
from django.conf import settings
|
||||
from django.core.files.uploadedfile import UploadedFile
|
||||
@@ -46,6 +47,36 @@ def check_upload_within_quota(realm: Realm, uploaded_file_size: int) -> None:
|
||||
raise RealmUploadQuotaError(_("Upload would exceed your organization's upload quota."))
|
||||
|
||||
|
||||
def maybe_add_charset(content_type: str, file_data: bytes | StreamingSourceWithSize) -> str:
    """Return ``content_type`` with a guessed ``charset`` parameter, when appropriate.

    We only add a charset if the content type doesn't already have one, and
    is a text type which we serve inline; currently, this is only
    text/plain.  In every other case ``content_type`` is returned unchanged.

    ``file_data`` is either the complete content as ``bytes``, or a
    ``StreamingSourceWithSize`` whose ``reader()`` is consumed in 4096-byte
    chunks until chardet reports that it is done or the stream is
    exhausted.  The guess is only applied when chardet reports an encoding
    with a confidence above 0.9.
    """
    fake_msg = EmailMessage()
    fake_msg["content-type"] = content_type
    if (
        fake_msg.get_content_maintype() != "text"
        or fake_msg.get_content_type() not in INLINE_MIME_TYPES
        or fake_msg.get_content_charset() is not None
    ):
        return content_type

    if isinstance(file_data, bytes):
        detected = chardet.detect(file_data)
    else:
        detector = chardet.universaldetector.UniversalDetector()
        reader = file_data.reader()
        try:
            # End-of-stream is signalled by an empty read, not a short one:
            # a stream may legitimately return fewer bytes than requested
            # before it is exhausted.
            while not detector.done and (data := reader.read(4096)):
                detector.feed(data)
        finally:
            # Always release the underlying stream, even if reading or
            # detection raised.
            reader.close()
        detector.close()
        detected = detector.result

    if detected["confidence"] > 0.9 and detected["encoding"]:
        fake_msg.set_param("charset", detected["encoding"], replace=True)
    return fake_msg["content-type"]
|
||||
|
||||
|
||||
def create_attachment(
|
||||
file_name: str,
|
||||
path_id: str,
|
||||
@@ -63,6 +94,8 @@ def create_attachment(
|
||||
else:
|
||||
file_size = file_data.size
|
||||
file_vips_data = file_data.vips_source
|
||||
|
||||
content_type = maybe_add_charset(content_type, file_data)
|
||||
attachment = Attachment.objects.create(
|
||||
file_name=file_name,
|
||||
path_id=path_id,
|
||||
|
@@ -2,17 +2,24 @@ import os
|
||||
from collections.abc import Callable, Iterator
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import IO, Any
|
||||
from typing import IO, Any, Protocol
|
||||
|
||||
import pyvips
|
||||
|
||||
from zerver.models import Realm, UserProfile
|
||||
|
||||
|
||||
class ReadableStream(Protocol):
    """Structural type for a byte stream that can be read and then closed."""

    def read(self, size: int = -1) -> bytes:
        """Return bytes read from the stream; ``size`` defaults to -1."""
        ...

    def close(self) -> None:
        """Close the stream."""
        ...
|
||||
|
||||
|
||||
@dataclass
class StreamingSourceWithSize:
    """An uploaded file's content, exposed for streaming along with its size."""

    # Size of the content; the backends in this change fill it from
    # os.path.getsize() or the S3 "ContentLength" metadata.
    size: int
    # pyvips source over the same content.
    vips_source: pyvips.Source
    # Zero-argument callable returning a stream of the raw content; the
    # caller is expected to close() the stream it gets back.
    reader: Callable[[], ReadableStream]
|
||||
|
||||
|
||||
class ZulipUploadBackend:
|
||||
|
@@ -110,6 +110,7 @@ class LocalUploadBackend(ZulipUploadBackend):
|
||||
return StreamingSourceWithSize(
|
||||
size=os.path.getsize(file_path),
|
||||
vips_source=vips_source,
|
||||
reader=lambda: open(file_path, "rb"),
|
||||
)
|
||||
|
||||
@override
|
||||
|
@@ -284,6 +284,7 @@ class S3UploadBackend(ZulipUploadBackend):
|
||||
return StreamingSourceWithSize(
|
||||
size=metadata["ContentLength"],
|
||||
vips_source=vips_source,
|
||||
reader=lambda: metadata["Body"],
|
||||
)
|
||||
|
||||
@override
|
||||
|
@@ -526,7 +526,7 @@ class TusdPreFinishTest(ZulipTestCase):
|
||||
|
||||
attachment = Attachment.objects.get(path_id=path_id)
|
||||
self.assertEqual(attachment.size, len("zulip!"))
|
||||
self.assertEqual(attachment.content_type, "text/plain")
|
||||
self.assertEqual(attachment.content_type, 'text/plain; charset="ascii"')
|
||||
|
||||
# Assert that the .info file is still there -- tusd needs it
|
||||
# to verify that the upload completed successfully
|
||||
|
@@ -199,11 +199,13 @@ class FileUploadTest(UploadSerializeMixin, ZulipTestCase):
|
||||
url = response_dict["url"]
|
||||
result = self.client_get(url)
|
||||
self.assertEqual(result.status_code, 200)
|
||||
self.assertEqual(result["Content-Type"], "text/plain")
|
||||
self.assertEqual(result["Content-Type"], 'text/plain; charset="ascii"')
|
||||
consume_response(result)
|
||||
|
||||
def test_preserve_provided_content_type(self) -> None:
|
||||
uploaded_file = SimpleUploadedFile("somefile.txt", b"zulip!", content_type="image/png")
|
||||
def test_guess_content_type_charset(self) -> None:
|
||||
uploaded_file = SimpleUploadedFile(
|
||||
"somefile.txt", "नाम में क्या रक्खा हे".encode(), content_type="text/plain"
|
||||
)
|
||||
result = self.api_post(
|
||||
self.example_user("hamlet"), "/api/v1/user_uploads", {"file": uploaded_file}
|
||||
)
|
||||
@@ -213,7 +215,7 @@ class FileUploadTest(UploadSerializeMixin, ZulipTestCase):
|
||||
url = response_dict["url"]
|
||||
result = self.client_get(url)
|
||||
self.assertEqual(result.status_code, 200)
|
||||
self.assertEqual(result["Content-Type"], "image/png")
|
||||
self.assertEqual(result["Content-Type"], 'text/plain; charset="utf-8"')
|
||||
consume_response(result)
|
||||
|
||||
def test_content_type_charset_specified(self) -> None:
|
||||
|
@@ -2,6 +2,7 @@ import base64
|
||||
import binascii
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from email.message import EmailMessage
|
||||
from urllib.parse import quote, urlsplit
|
||||
|
||||
from django.conf import settings
|
||||
@@ -110,6 +111,12 @@ def serve_s3(
|
||||
return response
|
||||
|
||||
|
||||
def bare_content_type(content_type: str) -> str:
    """Return the ``maintype/subtype`` of ``content_type``, without parameters.

    The value is parsed as a Content-Type header, so parameters such as
    ``charset`` are dropped.
    """
    header_holder = EmailMessage()
    header_holder["content-type"] = content_type
    mime_type = header_holder.get_content_type()
    return mime_type
|
||||
|
||||
|
||||
def serve_local(
|
||||
request: HttpRequest,
|
||||
path_id: str,
|
||||
@@ -125,7 +132,8 @@ def serve_local(
|
||||
|
||||
if content_type is None:
|
||||
content_type = guess_type(filename)[0]
|
||||
download = force_download or content_type not in INLINE_MIME_TYPES
|
||||
assert content_type is not None
|
||||
download = force_download or bare_content_type(content_type) not in INLINE_MIME_TYPES
|
||||
|
||||
if settings.DEVELOPMENT:
|
||||
# In development, we do not have the nginx server to offload
|
||||
|
Reference in New Issue
Block a user