tusd: Remove non-ASCII characters from path-ids.

This commit is contained in:
Alex Vandiver
2024-09-25 19:02:46 +00:00
committed by Tim Abbott
parent 84280ed7c2
commit 287850d08d
3 changed files with 14 additions and 7 deletions

View File

@@ -114,20 +114,27 @@ def get_public_upload_root_url() -> str:
return upload_backend.get_public_upload_root_url()
def sanitize_name(value: str) -> str:
"""
Sanitizes a value to be safe to store in a Linux filesystem, in
def sanitize_name(value: str, *, strict: bool = False) -> str:
"""Sanitizes a value to be safe to store in a Linux filesystem, in
S3, and in a URL. So Unicode is allowed, but not special
characters other than ".", "-", and "_".
In "strict" mode, it does not allow Unicode, allowing only ASCII
[A-Za-z0-9_] as word characters. This is for the benefit of tusd,
which is not Unicode-aware.
This implementation is based on django.utils.text.slugify; it is
modified by:
* adding '.' to the list of allowed characters.
* preserving the case of the value.
* not stripping trailing dashes and underscores.
"""
value = unicodedata.normalize("NFKC", value)
value = re.sub(r"[^\w\s.-]", "", value).strip()
if strict:
value = re.sub(r"[^A-Za-z0-9_ .-]", "", value).strip()
else:
value = unicodedata.normalize("NFKC", value)
value = re.sub(r"[^\w\s.-]", "", value).strip()
value = re.sub(r"[-\s]+", "-", value)
# Django's MultiPartParser never returns files named this, but we