upload: Attempt to guess the character set of text/* files which are served inline.

This is only `text/plain`, currently.  In such cases where the
client-provided content-type also does not specify a `charset`, we use
`chardet` to make a guess, and store that guess to provide later when
serving the content.  The detection is done in a streaming fashion,
and thus should not require re-downloading the full content.
This commit is contained in:
Alex Vandiver
2025-07-18 05:55:07 +00:00
committed by Tim Abbott
parent ae001dfa96
commit 25fbb05fea
10 changed files with 76 additions and 20 deletions

View File

@@ -204,6 +204,9 @@ prod = [
# For E2EE of push notifications
"pynacl",
# Character set detection for text/plain
"chardet>=5.1.0"
]
docs = [
# Needed to build RTD docs
@@ -274,7 +277,6 @@ dev = [
"SQLAlchemy[mypy]",
"types-beautifulsoup4",
"types-boto",
"types-chardet",
"types-decorator",
"types-defusedxml",
"types-jsonschema",

24
uv.lock generated
View File

@@ -538,6 +538,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" },
]
[[package]]
name = "chardet"
version = "5.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" },
]
[[package]]
name = "charset-normalizer"
version = "3.4.2"
@@ -4700,15 +4709,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f1/86/e26e6ae4dfcbf6031b8422c22cf3a9eb2b6d127770406e7645b6248d8091/types_cffi-1.17.0.20250523-py3-none-any.whl", hash = "sha256:e98c549d8e191f6220e440f9f14315d6775a21a0e588c32c20476be885b2fad9", size = 20010, upload-time = "2025-05-23T03:05:39.136Z" },
]
[[package]]
name = "types-chardet"
version = "5.0.4.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/dd/47/932d35ac07203e936e69102dc9570e83606d386bacb60696f0c403224e86/types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818", size = 4592, upload-time = "2023-05-10T15:22:21.325Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/10/35/2a06c5c892eb1a0a4f4f74a6aff1ade05da82444af0190cf731761f2c46c/types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d", size = 5853, upload-time = "2023-05-10T15:22:19.797Z" },
]
[[package]]
name = "types-decorator"
version = "5.2.0.20250324"
@@ -5359,6 +5359,7 @@ dev = [
{ name = "black" },
{ name = "boto3" },
{ name = "boto3-stubs", extra = ["s3", "ses", "sns", "sqs"] },
{ name = "chardet" },
{ name = "circuitbreaker" },
{ name = "codespell" },
{ name = "coverage" },
@@ -5450,7 +5451,6 @@ dev = [
{ name = "tornado" },
{ name = "types-beautifulsoup4" },
{ name = "types-boto" },
{ name = "types-chardet" },
{ name = "types-decorator" },
{ name = "types-defusedxml" },
{ name = "types-jsonschema" },
@@ -5494,6 +5494,7 @@ prod = [
{ name = "backoff" },
{ name = "beautifulsoup4" },
{ name = "boto3" },
{ name = "chardet" },
{ name = "circuitbreaker" },
{ name = "cryptography" },
{ name = "css-inline" },
@@ -5582,6 +5583,7 @@ dev = [
{ name = "black" },
{ name = "boto3" },
{ name = "boto3-stubs", extras = ["s3", "ses", "sns", "sqs"] },
{ name = "chardet", specifier = ">=5.1.0" },
{ name = "circuitbreaker" },
{ name = "codespell" },
{ name = "coverage" },
@@ -5674,7 +5676,6 @@ dev = [
{ name = "tornado" },
{ name = "types-beautifulsoup4" },
{ name = "types-boto" },
{ name = "types-chardet" },
{ name = "types-decorator" },
{ name = "types-defusedxml" },
{ name = "types-jsonschema" },
@@ -5718,6 +5719,7 @@ prod = [
{ name = "backoff" },
{ name = "beautifulsoup4" },
{ name = "boto3" },
{ name = "chardet", specifier = ">=5.1.0" },
{ name = "circuitbreaker" },
{ name = "cryptography" },
{ name = "css-inline" },

View File

@@ -49,4 +49,4 @@ API_FEATURE_LEVEL = 408
# historical commits sharing the same major version, in which case a
# minor version bump suffices.
PROVISION_VERSION = (337, 1) # bumped 2025-07-21 to upgrade shfmt
PROVISION_VERSION = (338, 0) # bumped 2025-07-18 to add chardet and remove types-chardet

View File

@@ -9,6 +9,7 @@ from email.message import EmailMessage
from typing import IO, Any
from urllib.parse import unquote, urljoin
import chardet
import pyvips
from django.conf import settings
from django.core.files.uploadedfile import UploadedFile
@@ -46,6 +47,36 @@ def check_upload_within_quota(realm: Realm, uploaded_file_size: int) -> None:
raise RealmUploadQuotaError(_("Upload would exceed your organization's upload quota."))
def maybe_add_charset(content_type: str, file_data: bytes | StreamingSourceWithSize) -> str:
    """Return ``content_type``, adding a guessed ``charset`` when appropriate.

    We only add a charset if the content-type doesn't already have one,
    and is a text type which we serve inline; currently, this is only
    text/plain.  In every other case ``content_type`` is returned
    unchanged.

    The guess comes from ``chardet``.  In-memory ``bytes`` are examined
    in one call; a ``StreamingSourceWithSize`` is read in 4096-byte
    chunks through its ``reader()`` so the detector can stop as soon
    as it has reached a conclusion.  The guess is only applied when
    chardet reports a confidence above 0.9 and a non-empty encoding.
    """
    fake_msg = EmailMessage()
    fake_msg["content-type"] = content_type
    if (
        fake_msg.get_content_maintype() != "text"
        or fake_msg.get_content_type() not in INLINE_MIME_TYPES
        or fake_msg.get_content_charset() is not None
    ):
        return content_type

    if isinstance(file_data, bytes):
        detected = chardet.detect(file_data)
    else:
        detector = chardet.universaldetector.UniversalDetector()
        reader = file_data.reader()
        try:
            while not detector.done:
                data = reader.read(4096)
                # Only an empty read is treated as end-of-stream; a
                # read shorter than requested does not by itself mean
                # that there is no more data to come.
                if not data:
                    break
                detector.feed(data)
        finally:
            # Always release the stream, even if reading or detection fails.
            reader.close()
        detector.close()
        detected = detector.result

    if detected["confidence"] > 0.9 and detected["encoding"]:
        fake_msg.set_param("charset", detected["encoding"], replace=True)
    return fake_msg["content-type"]
def create_attachment(
file_name: str,
path_id: str,
@@ -63,6 +94,8 @@ def create_attachment(
else:
file_size = file_data.size
file_vips_data = file_data.vips_source
content_type = maybe_add_charset(content_type, file_data)
attachment = Attachment.objects.create(
file_name=file_name,
path_id=path_id,

View File

@@ -2,17 +2,24 @@ import os
from collections.abc import Callable, Iterator
from dataclasses import dataclass
from datetime import datetime
from typing import IO, Any
from typing import IO, Any, Protocol
import pyvips
from zerver.models import Realm, UserProfile
class ReadableStream(Protocol):
    """Structural type for a binary stream which can be read and then closed."""

    def read(self, size: int = -1) -> bytes:
        """Read from the stream and return the data as ``bytes``."""
        ...

    def close(self) -> None:
        """Close the stream."""
        ...
@dataclass
class StreamingSourceWithSize:
    # Bundles a pyvips source for some stored content with its size and
    # a way to obtain an independent byte stream over the same content.

    # Size of the content, in bytes.
    size: int
    # pyvips source over the content.
    vips_source: pyvips.Source
    # Zero-argument callable returning a stream to read the content
    # from; whoever calls it is expected to close the returned stream.
    reader: Callable[[], ReadableStream]
class ZulipUploadBackend:

View File

@@ -110,6 +110,7 @@ class LocalUploadBackend(ZulipUploadBackend):
return StreamingSourceWithSize(
size=os.path.getsize(file_path),
vips_source=vips_source,
reader=lambda: open(file_path, "rb"),
)
@override

View File

@@ -284,6 +284,7 @@ class S3UploadBackend(ZulipUploadBackend):
return StreamingSourceWithSize(
size=metadata["ContentLength"],
vips_source=vips_source,
reader=lambda: metadata["Body"],
)
@override

View File

@@ -526,7 +526,7 @@ class TusdPreFinishTest(ZulipTestCase):
attachment = Attachment.objects.get(path_id=path_id)
self.assertEqual(attachment.size, len("zulip!"))
self.assertEqual(attachment.content_type, "text/plain")
self.assertEqual(attachment.content_type, 'text/plain; charset="ascii"')
# Assert that the .info file is still there -- tusd needs it
# to verify that the upload completed successfully

View File

@@ -199,11 +199,13 @@ class FileUploadTest(UploadSerializeMixin, ZulipTestCase):
url = response_dict["url"]
result = self.client_get(url)
self.assertEqual(result.status_code, 200)
self.assertEqual(result["Content-Type"], "text/plain")
self.assertEqual(result["Content-Type"], 'text/plain; charset="ascii"')
consume_response(result)
def test_preserve_provided_content_type(self) -> None:
uploaded_file = SimpleUploadedFile("somefile.txt", b"zulip!", content_type="image/png")
def test_guess_content_type_charset(self) -> None:
uploaded_file = SimpleUploadedFile(
"somefile.txt", "नाम में क्या रक्खा हे".encode(), content_type="text/plain"
)
result = self.api_post(
self.example_user("hamlet"), "/api/v1/user_uploads", {"file": uploaded_file}
)
@@ -213,7 +215,7 @@ class FileUploadTest(UploadSerializeMixin, ZulipTestCase):
url = response_dict["url"]
result = self.client_get(url)
self.assertEqual(result.status_code, 200)
self.assertEqual(result["Content-Type"], "image/png")
self.assertEqual(result["Content-Type"], 'text/plain; charset="utf-8"')
consume_response(result)
def test_content_type_charset_specified(self) -> None:

View File

@@ -2,6 +2,7 @@ import base64
import binascii
import os
from datetime import timedelta
from email.message import EmailMessage
from urllib.parse import quote, urlsplit
from django.conf import settings
@@ -110,6 +111,12 @@ def serve_s3(
return response
def bare_content_type(content_type: str) -> str:
    """Return only the ``maintype/subtype`` part of a Content-Type value.

    The value is parsed as an email-style header, so any parameters
    (such as ``charset``) are dropped from the result.
    """
    header_holder = EmailMessage()
    header_holder.add_header("content-type", content_type)
    return header_holder.get_content_type()
def serve_local(
request: HttpRequest,
path_id: str,
@@ -125,7 +132,8 @@ def serve_local(
if content_type is None:
content_type = guess_type(filename)[0]
download = force_download or content_type not in INLINE_MIME_TYPES
assert content_type is not None
download = force_download or bare_content_type(content_type) not in INLINE_MIME_TYPES
if settings.DEVELOPMENT:
# In development, we do not have the nginx server to offload