Add Attachment model to keep track of uploads.

This commit adds the capability to keep track and remove uploaded
files.  Unclaimed attachments are files that have been uploaded to the
server but are not referred in any messages.  A management command to
remove old unclaimed files after a week is also included.

Tests for getting the file referred in messages are also included.
This commit is contained in:
rahuldeve
2016-03-25 00:54:01 +05:30
committed by Tim Abbott
parent 391a225595
commit dde832b158
7 changed files with 294 additions and 7 deletions

View File

@@ -8,7 +8,7 @@ from django.contrib.sessions.models import Session
from zerver.lib.cache import flush_user_profile from zerver.lib.cache import flush_user_profile
from zerver.lib.context_managers import lockfile from zerver.lib.context_managers import lockfile
from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity, \ from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity, \
Subscription, Recipient, Message, UserMessage, valid_stream_name, \ Subscription, Recipient, Message, Attachment, UserMessage, valid_stream_name, \
DefaultStream, UserPresence, Referral, PushDeviceToken, MAX_SUBJECT_LENGTH, \ DefaultStream, UserPresence, Referral, PushDeviceToken, MAX_SUBJECT_LENGTH, \
MAX_MESSAGE_LENGTH, get_client, get_stream, get_recipient, get_huddle, \ MAX_MESSAGE_LENGTH, get_client, get_stream, get_recipient, get_huddle, \
get_user_profile_by_id, PreregistrationUser, get_display_recipient, \ get_user_profile_by_id, PreregistrationUser, get_display_recipient, \
@@ -17,7 +17,8 @@ from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity,
get_user_profile_by_email, get_stream_cache_key, to_dict_cache_key_id, \ get_user_profile_by_email, get_stream_cache_key, to_dict_cache_key_id, \
UserActivityInterval, get_active_user_dicts_in_realm, get_active_streams, \ UserActivityInterval, get_active_user_dicts_in_realm, get_active_streams, \
realm_filters_for_domain, RealmFilter, receives_offline_notifications, \ realm_filters_for_domain, RealmFilter, receives_offline_notifications, \
ScheduledJob, realm_filters_for_domain, get_active_bot_dicts_in_realm ScheduledJob, realm_filters_for_domain, get_active_bot_dicts_in_realm, \
get_old_unclaimed_attachments
from zerver.lib.avatar import get_avatar_url, avatar_url from zerver.lib.avatar import get_avatar_url, avatar_url
@@ -57,6 +58,7 @@ from zerver.lib.push_notifications import num_push_devices_for_user, \
from zerver.lib.notifications import clear_followup_emails_queue from zerver.lib.notifications import clear_followup_emails_queue
from zerver.lib.narrow import check_supported_events_narrow_filter from zerver.lib.narrow import check_supported_events_narrow_filter
from zerver.lib.session_user import get_session_user from zerver.lib.session_user import get_session_user
from zerver.lib.upload import claim_attachment, delete_message_image
import DNS import DNS
import ujson import ujson
@@ -609,6 +611,11 @@ def do_send_messages(messages):
ums.extend(ums_to_create) ums.extend(ums_to_create)
UserMessage.objects.bulk_create(ums) UserMessage.objects.bulk_create(ums)
# Claim attachments in message
for message in messages:
if Message.content_has_attachment(message['message'].content):
do_claim_attachments(message)
for message in messages: for message in messages:
cache_save_message(message['message']) cache_save_message(message['message'])
# Render Markdown etc. here and store (automatically) in # Render Markdown etc. here and store (automatically) in
@@ -2998,3 +3005,26 @@ def do_get_streams(user_profile, include_public=True, include_subscribed=True,
streams.sort(key=lambda elt: elt["name"]) streams.sort(key=lambda elt: elt["name"])
return streams return streams
def do_claim_attachments(message):
atttachment_url_re = re.compile('[/\-]user[\-_]uploads[/\.-].*?(?=[ )]|\Z)')
attachment_url_list = atttachment_url_re.findall(message['message'].content)
results = []
for url in attachment_url_list:
path_id = re.sub('[/\-]user[\-_]uploads[/\.-]', '', url)
# Remove any extra '.' after file extension. These are probably added by the user
path_id = re.sub('[.]+$', '', path_id, re.M)
if path_id is not None:
is_claimed = claim_attachment(path_id, message['message'])
results.append((path_id, is_claimed))
return results
def do_delete_old_unclaimed_attachments(weeks_ago):
old_unclaimed_attachments = get_old_unclaimed_attachments(weeks_ago)
for attachment in old_unclaimed_attachments:
delete_message_image(attachment.path_id)
attachment.delete()

View File

@@ -14,6 +14,7 @@ from boto.s3.connection import S3Connection
from mimetypes import guess_type, guess_extension from mimetypes import guess_type, guess_extension
from zerver.models import get_user_profile_by_id from zerver.models import get_user_profile_by_id
from zerver.models import Attachment
import base64 import base64
import os import os
@@ -21,6 +22,7 @@ import re
from PIL import Image, ImageOps from PIL import Image, ImageOps
from six.moves import cStringIO as StringIO from six.moves import cStringIO as StringIO
import random import random
import logging
# Performance Note: # Performance Note:
# #
@@ -133,6 +135,8 @@ def upload_message_image_s3(uploaded_file_name, content_type, file_data, user_pr
user_profile, user_profile,
file_data file_data
) )
create_attachment(uploaded_file_name, s3_file_name, user_profile)
return url return url
def get_signed_upload_url(path): def get_signed_upload_url(path):
@@ -147,6 +151,20 @@ def get_realm_for_filename(path):
return None return None
return get_user_profile_by_id(key.metadata["user_profile_id"]).realm.id return get_user_profile_by_id(key.metadata["user_profile_id"]).realm.id
def delete_message_image_s3(path_id):
conn = S3Connection(settings.S3_KEY, settings.S3_SECRET_KEY)
bucket = get_bucket(conn, settings.S3_AUTH_UPLOADS_BUCKET)
# check if file exists
key = bucket.get_key(path_id)
if key is not None:
bucket.delete_key(key)
return True
file_name = path_id.split("/")[-1]
logging.warning("%s does not exist. Its entry in the database will be removed." % (file_name,))
return False
def upload_avatar_image_s3(user_file, user_profile, email): def upload_avatar_image_s3(user_file, user_profile, email):
content_type = guess_type(user_file.name)[0] content_type = guess_type(user_file.name)[0]
bucket_name = settings.S3_AVATAR_BUCKET bucket_name = settings.S3_AVATAR_BUCKET
@@ -195,9 +213,20 @@ def upload_message_image_local(uploaded_file_name, content_type, file_data, user
]) ])
write_local_file('files', path, file_data) write_local_file('files', path, file_data)
create_attachment(uploaded_file_name, path, user_profile)
return '/user_uploads/' + path return '/user_uploads/' + path
def delete_message_image_local(path_id):
file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, 'files', path_id)
if os.path.isfile(file_path):
# This removes the file but the empty folders still remain.
os.remove(file_path)
return True
file_name = path_id.split("/")[-1]
logging.warning("%s does not exist. Its entry in the database will be removed." % (file_name,))
return False
def upload_avatar_image_local(user_file, user_profile, email): def upload_avatar_image_local(user_file, user_profile, email):
email_hash = user_avatar_hash(email) email_hash = user_avatar_hash(email)
@@ -212,9 +241,25 @@ def upload_avatar_image_local(user_file, user_profile, email):
if settings.LOCAL_UPLOADS_DIR is not None: if settings.LOCAL_UPLOADS_DIR is not None:
upload_message_image = upload_message_image_local upload_message_image = upload_message_image_local
upload_avatar_image = upload_avatar_image_local upload_avatar_image = upload_avatar_image_local
delete_message_image = delete_message_image_local
else: else:
upload_message_image = upload_message_image_s3 upload_message_image = upload_message_image_s3
upload_avatar_image = upload_avatar_image_s3 upload_avatar_image = upload_avatar_image_s3
delete_message_image = delete_message_image_s3
def claim_attachment(path_id, message):
try:
attachment = Attachment.objects.get(path_id=path_id)
attachment.messages.add(message)
attachment.save()
return True
except Attachment.DoesNotExist:
raise JsonableError("The upload was not successful. Please reupload the file again in a new message.")
return False
def create_attachment(file_name, path_id, user_profile):
Attachment.objects.create(file_name=file_name, path_id=path_id, owner=user_profile)
return True
def upload_message_image_through_web_client(request, user_file, user_profile): def upload_message_image_through_web_client(request, user_file, user_profile):
uploaded_file_name, content_type = get_file_info(request, user_file) uploaded_file_name, content_type = get_file_info(request, user_file)

View File

@@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import print_function
from django.core.management.base import BaseCommand
from zerver.lib.actions import do_delete_old_unclaimed_attachments
from zerver.models import Attachment, get_old_unclaimed_attachments
class Command(BaseCommand):
help = """Remove unclaimed attachments from storage older than a supplied
numerical value indicating the limit of how old the attachment can be.
One week is taken as the default value."""
def add_arguments(self, parser):
parser.add_argument('-w', '--weeks',
dest='delta_weeks',
default=1,
help="Limiting value of how old the file can be.")
parser.add_argument('-f', '--for-real',
dest='for_real',
action='store_true',
default=False,
help="Actually remove the files from the storage.")
def handle(self, *args, **options):
delta_weeks = options['delta_weeks']
print("Deleting unclaimed attached files older than %s" % (delta_weeks,))
print("")
# print the list of files that are going to be removed
old_attachments = get_old_unclaimed_attachments(delta_weeks)
for old_attachment in old_attachments:
print("%s created at %s" % (old_attachment.file_name, old_attachment.create_time))
print("")
if not options["for_real"]:
print("This was a dry run. Pass -f to actually delete.")
exit(1)
do_delete_old_unclaimed_attachments(delta_weeks)
print("")
print("Unclaimed Files deleted.")

View File

@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import models, migrations
import django.utils.timezone
from django.conf import settings
class Migration(migrations.Migration):
dependencies = [
('zerver', '0014_realm_emoji_url_length'),
]
operations = [
migrations.CreateModel(
name='Attachment',
fields=[
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
('file_name', models.CharField(max_length=100, db_index=True)),
('path_id', models.TextField(db_index=True)),
('create_time', models.DateTimeField(default=django.utils.timezone.now, db_index=True)),
('messages', models.ManyToManyField(to='zerver.Message')),
('owner', models.ForeignKey(to=settings.AUTH_USER_MODEL)),
],
),
]

View File

@@ -30,6 +30,8 @@ import pylibmc
import re import re
import ujson import ujson
import logging import logging
import time
import datetime
bugdown = None # type: Any bugdown = None # type: Any
@@ -1066,6 +1068,39 @@ def parse_usermessage_flags(val):
mask <<= 1 mask <<= 1
return flags return flags
class Attachment(models.Model):
MAX_FILENAME_LENGTH = 100
file_name = models.CharField(max_length=MAX_FILENAME_LENGTH, db_index=True)
# path_id is a storage location agnostic representation of the path of the file.
# If the path of a file is http://localhost:9991/user_uploads/a/b/abc/temp_file.py
# then its path_id will be a/b/abc/temp_file.py.
path_id = models.TextField(db_index=True)
owner = models.ForeignKey(UserProfile)
messages = models.ManyToManyField(Message)
create_time = models.DateTimeField(default=timezone.now, db_index=True)
def __repr__(self):
return (u"<Attachment: %s>" % (self.file_name))
def is_claimed(self):
return self.messages.count() > 0
def get_url(self):
return "/user_uploads/%s" % (self.path_id)
def get_attachments_by_owner_id(uid):
return Attachment.objects.filter(owner=uid).select_related('owner')
def get_owners_from_file_name(file_name):
# The returned vaule will list of owners since different users can upload
# same files with the same filename.
return Attachment.objects.filter(file_name=file_name).select_related('owner')
def get_old_unclaimed_attachments(weeks_ago):
delta_weeks_ago = timezone.now() - datetime.timedelta(weeks=weeks_ago)
old_attachments = Attachment.objects.filter(messages=None, create_time__lt=delta_weeks_ago)
return old_attachments
class Subscription(models.Model): class Subscription(models.Model):
user_profile = models.ForeignKey(UserProfile) user_profile = models.ForeignKey(UserProfile)
recipient = models.ForeignKey(Recipient) recipient = models.ForeignKey(Recipient)

View File

@@ -26,7 +26,7 @@ from zerver.lib.test_helpers import (
from zerver.models import ( from zerver.models import (
MAX_MESSAGE_LENGTH, MAX_SUBJECT_LENGTH, MAX_MESSAGE_LENGTH, MAX_SUBJECT_LENGTH,
Client, Message, Realm, Recipient, Stream, Subscription, UserMessage, UserProfile, Client, Message, Realm, Recipient, Stream, Subscription, UserMessage, UserProfile, Attachment,
get_display_recipient, get_recipient, get_realm, get_stream, get_user_profile_by_email, get_display_recipient, get_recipient, get_realm, get_stream, get_user_profile_by_email,
) )
@@ -36,6 +36,8 @@ from zerver.lib.actions import (
do_add_subscription, do_create_user, do_add_subscription, do_create_user,
) )
from zerver.lib.upload import create_attachment
import datetime import datetime
import time import time
import re import re
@@ -1407,7 +1409,7 @@ class StarTests(AuthedTestCase):
self.assertEqual(sent_message.message.content, content) self.assertEqual(sent_message.message.content, content)
self.assertFalse(sent_message.flags.starred) self.assertFalse(sent_message.flags.starred)
class AttachmentTest(TestCase): class AttachmentTest(AuthedTestCase):
def test_basics(self): def test_basics(self):
self.assertFalse(Message.content_has_attachment('whatever')) self.assertFalse(Message.content_has_attachment('whatever'))
self.assertFalse(Message.content_has_attachment('yo http://foo.com')) self.assertFalse(Message.content_has_attachment('yo http://foo.com'))
@@ -1432,6 +1434,32 @@ class AttachmentTest(TestCase):
self.assertTrue(Message.content_has_link('https://humbug-user-uploads.s3.amazonaws.com/sX_TIQx/screen-shot.jpg')) self.assertTrue(Message.content_has_link('https://humbug-user-uploads.s3.amazonaws.com/sX_TIQx/screen-shot.jpg'))
self.assertTrue(Message.content_has_link('https://humbug-user-uploads-test.s3.amazonaws.com/sX_TIQx/screen-shot.jpg')) self.assertTrue(Message.content_has_link('https://humbug-user-uploads-test.s3.amazonaws.com/sX_TIQx/screen-shot.jpg'))
def test_claim_attachment(self):
# Create dummy DB entry
sender_email = "hamlet@zulip.com"
user_profile = get_user_profile_by_email(sender_email)
dummy_files = [
('zulip.txt', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt'),
('temp_file.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py'),
('abc.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py')
]
for file_name, path_id in dummy_files:
create_attachment(file_name, path_id, user_profile)
# Send message referring the attachment
self.subscribe_to_stream(sender_email, "Denmark")
body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" + \
"http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." + \
"http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py"
self.send_message(sender_email, "Denmark", Recipient.STREAM, body, "test")
for file_name, path_id in dummy_files:
attachment = Attachment.objects.get(path_id=path_id)
self.assertTrue(attachment.is_claimed())
class CheckMessageTest(AuthedTestCase): class CheckMessageTest(AuthedTestCase):
def test_basic_check_message_call(self): def test_basic_check_message_call(self):
sender = get_user_profile_by_email('othello@zulip.com') sender = get_user_profile_by_email('othello@zulip.com')

View File

@@ -6,7 +6,11 @@ from unittest import skip
from zerver.lib.test_helpers import AuthedTestCase from zerver.lib.test_helpers import AuthedTestCase
from zerver.lib.test_runner import slow from zerver.lib.test_runner import slow
from zerver.lib.upload import sanitize_name from zerver.lib.upload import sanitize_name, delete_message_image, delete_message_image_local, \
delete_message_image_s3, upload_message_image_s3, upload_message_image_local
from zerver.models import Attachment, Recipient, get_user_profile_by_email, \
get_old_unclaimed_attachments
from zerver.lib.actions import do_delete_old_unclaimed_attachments
import ujson import ujson
from six.moves import urllib from six.moves import urllib
@@ -16,6 +20,12 @@ from boto.s3.key import Key
from six.moves import StringIO from six.moves import StringIO
import os import os
import shutil import shutil
import re
import datetime
from datetime import timedelta
from django.utils import timezone
from moto import mock_s3
TEST_AVATAR_DIR = os.path.join(os.path.dirname(__file__), 'images') TEST_AVATAR_DIR = os.path.join(os.path.dirname(__file__), 'images')
@@ -46,9 +56,13 @@ class FileUploadTest(AuthedTestCase):
result = self.client.post("/json/upload_file") result = self.client.post("/json/upload_file")
self.assert_json_error(result, "You must specify a file to upload") self.assert_json_error(result, "You must specify a file to upload")
# This test will go through the code path for uploading files onto LOCAL storage
# when zulip is in DEVELOPMENT mode.
def test_file_upload_authed(self): def test_file_upload_authed(self):
""" """
A call to /json/upload_file should return a uri and actually create an object. A call to /json/upload_file should return a uri and actually create an
entry in the database. This entry will be marked unclaimed till a message
refers it.
""" """
self.login("hamlet@zulip.com") self.login("hamlet@zulip.com")
fp = StringIO("zulip!") fp = StringIO("zulip!")
@@ -62,10 +76,72 @@ class FileUploadTest(AuthedTestCase):
base = '/user_uploads/' base = '/user_uploads/'
self.assertEquals(base, uri[:len(base)]) self.assertEquals(base, uri[:len(base)])
# In the future, local file requests will follow the same style as S3
# requests; they will be first authenthicated and redirected
response = self.client.get(uri) response = self.client.get(uri)
data = "".join(response.streaming_content) data = "".join(response.streaming_content)
self.assertEquals("zulip!", data) self.assertEquals("zulip!", data)
# check if DB has attachment marked as unclaimed
entry = Attachment.objects.get(file_name='zulip.txt')
self.assertEquals(entry.is_claimed(), False)
def test_delete_old_unclaimed_attachments(self):
# Upload some files and make them older than a weeek
self.login("hamlet@zulip.com")
d1 = StringIO("zulip!")
d1.name = "dummy_1.txt"
result = self.client.post("/json/upload_file", {'file': d1})
json = ujson.loads(result.content)
uri = json["uri"]
d1_path_id = re.sub('/user_uploads/', '', uri)
d2 = StringIO("zulip!")
d2.name = "dummy_2.txt"
result = self.client.post("/json/upload_file", {'file': d2})
json = ujson.loads(result.content)
uri = json["uri"]
d2_path_id = re.sub('/user_uploads/', '', uri)
two_week_ago = timezone.now() - datetime.timedelta(weeks=2)
d1_attachment = Attachment.objects.get(path_id = d1_path_id)
d1_attachment.create_time = two_week_ago
d1_attachment.save()
d2_attachment = Attachment.objects.get(path_id = d2_path_id)
d2_attachment.create_time = two_week_ago
d2_attachment.save()
# Send message refering only dummy_1
self.subscribe_to_stream("hamlet@zulip.com", "Denmark")
body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")"
self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test")
# dummy_2 should not exist in database or the uploads folder
do_delete_old_unclaimed_attachments(2)
self.assertTrue(not Attachment.objects.filter(path_id = d2_path_id).exists())
self.assertTrue(not delete_message_image(d2_path_id))
def test_multiple_claim_attachments(self):
"""
This test tries to claim the same attachment twice. The messages field in
the Attachment model should have both the messages in its entry.
"""
self.login("hamlet@zulip.com")
d1 = StringIO("zulip!")
d1.name = "dummy_1.txt"
result = self.client.post("/json/upload_file", {'file': d1})
json = ujson.loads(result.content)
uri = json["uri"]
d1_path_id = re.sub('/user_uploads/', '', uri)
self.subscribe_to_stream("hamlet@zulip.com", "Denmark")
body = "First message ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")"
self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test")
body = "Second message ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")"
self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test")
self.assertEquals(Attachment.objects.get(path_id=d1_path_id).messages.count(), 2)
def tearDown(self): def tearDown(self):
destroy_uploads() destroy_uploads()
@@ -136,6 +212,8 @@ class SetAvatarTest(AuthedTestCase):
def tearDown(self): def tearDown(self):
destroy_uploads() destroy_uploads()
class S3Test(AuthedTestCase): class S3Test(AuthedTestCase):
# full URIs in public bucket # full URIs in public bucket
test_uris = [] # type: List[str] test_uris = [] # type: List[str]