diff --git a/zerver/lib/actions.py b/zerver/lib/actions.py index e7e5c794f9..22171394cd 100644 --- a/zerver/lib/actions.py +++ b/zerver/lib/actions.py @@ -8,7 +8,7 @@ from django.contrib.sessions.models import Session from zerver.lib.cache import flush_user_profile from zerver.lib.context_managers import lockfile from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity, \ - Subscription, Recipient, Message, UserMessage, valid_stream_name, \ + Subscription, Recipient, Message, Attachment, UserMessage, valid_stream_name, \ DefaultStream, UserPresence, Referral, PushDeviceToken, MAX_SUBJECT_LENGTH, \ MAX_MESSAGE_LENGTH, get_client, get_stream, get_recipient, get_huddle, \ get_user_profile_by_id, PreregistrationUser, get_display_recipient, \ @@ -17,7 +17,8 @@ from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity, get_user_profile_by_email, get_stream_cache_key, to_dict_cache_key_id, \ UserActivityInterval, get_active_user_dicts_in_realm, get_active_streams, \ realm_filters_for_domain, RealmFilter, receives_offline_notifications, \ - ScheduledJob, realm_filters_for_domain, get_active_bot_dicts_in_realm + ScheduledJob, realm_filters_for_domain, get_active_bot_dicts_in_realm, \ + get_old_unclaimed_attachments from zerver.lib.avatar import get_avatar_url, avatar_url @@ -57,6 +58,7 @@ from zerver.lib.push_notifications import num_push_devices_for_user, \ from zerver.lib.notifications import clear_followup_emails_queue from zerver.lib.narrow import check_supported_events_narrow_filter from zerver.lib.session_user import get_session_user +from zerver.lib.upload import claim_attachment, delete_message_image import DNS import ujson @@ -609,6 +611,11 @@ def do_send_messages(messages): ums.extend(ums_to_create) UserMessage.objects.bulk_create(ums) + # Claim attachments in message + for message in messages: + if Message.content_has_attachment(message['message'].content): + do_claim_attachments(message) + for message in messages: cache_save_message(message['message']) # Render Markdown etc. here and store (automatically) in @@ -2998,3 +3005,26 @@ def do_get_streams(user_profile, include_public=True, include_subscribed=True, streams.sort(key=lambda elt: elt["name"]) return streams + +def do_claim_attachments(message): + atttachment_url_re = re.compile('[/\-]user[\-_]uploads[/\.-].*?(?=[ )]|\Z)') + attachment_url_list = atttachment_url_re.findall(message['message'].content) + + results = [] + for url in attachment_url_list: + path_id = re.sub('[/\-]user[\-_]uploads[/\.-]', '', url) + # Remove any extra '.' after file extension. These are probably added by the user + path_id = re.sub('[.]+$', '', path_id, re.M) + + if path_id is not None: + is_claimed = claim_attachment(path_id, message['message']) + results.append((path_id, is_claimed)) + + return results + +def do_delete_old_unclaimed_attachments(weeks_ago): + old_unclaimed_attachments = get_old_unclaimed_attachments(weeks_ago) + + for attachment in old_unclaimed_attachments: + delete_message_image(attachment.path_id) + attachment.delete() diff --git a/zerver/lib/upload.py b/zerver/lib/upload.py index e9a1d4f31a..2513edfc84 100644 --- a/zerver/lib/upload.py +++ b/zerver/lib/upload.py @@ -14,6 +14,7 @@ from boto.s3.connection import S3Connection from mimetypes import guess_type, guess_extension from zerver.models import get_user_profile_by_id +from zerver.models import Attachment import base64 import os @@ -21,6 +22,7 @@ import re from PIL import Image, ImageOps from six.moves import cStringIO as StringIO import random +import logging # Performance Note: # @@ -133,6 +135,8 @@ def upload_message_image_s3(uploaded_file_name, content_type, file_data, user_pr user_profile, file_data ) + + create_attachment(uploaded_file_name, s3_file_name, user_profile) return url def get_signed_upload_url(path): @@ -147,6 +151,20 @@ def get_realm_for_filename(path): return None return get_user_profile_by_id(key.metadata["user_profile_id"]).realm.id +def delete_message_image_s3(path_id): + conn = S3Connection(settings.S3_KEY, settings.S3_SECRET_KEY) + bucket = get_bucket(conn, settings.S3_AUTH_UPLOADS_BUCKET) + + # check if file exists + key = bucket.get_key(path_id) + if key is not None: + bucket.delete_key(key) + return True + + file_name = path_id.split("/")[-1] + logging.warning("%s does not exist. Its entry in the database will be removed." % (file_name,)) + return False + def upload_avatar_image_s3(user_file, user_profile, email): content_type = guess_type(user_file.name)[0] bucket_name = settings.S3_AVATAR_BUCKET @@ -195,9 +213,20 @@ def upload_message_image_local(uploaded_file_name, content_type, file_data, user ]) write_local_file('files', path, file_data) - + create_attachment(uploaded_file_name, path, user_profile) return '/user_uploads/' + path +def delete_message_image_local(path_id): + file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, 'files', path_id) + if os.path.isfile(file_path): + # This removes the file but the empty folders still remain. + os.remove(file_path) + return True + + file_name = path_id.split("/")[-1] + logging.warning("%s does not exist. Its entry in the database will be removed." % (file_name,)) + return False + def upload_avatar_image_local(user_file, user_profile, email): email_hash = user_avatar_hash(email) @@ -212,9 +241,25 @@ def upload_avatar_image_local(user_file, user_profile, email): if settings.LOCAL_UPLOADS_DIR is not None: upload_message_image = upload_message_image_local upload_avatar_image = upload_avatar_image_local + delete_message_image = delete_message_image_local else: upload_message_image = upload_message_image_s3 upload_avatar_image = upload_avatar_image_s3 + delete_message_image = delete_message_image_s3 + +def claim_attachment(path_id, message): + try: + attachment = Attachment.objects.get(path_id=path_id) + attachment.messages.add(message) + attachment.save() + return True + except Attachment.DoesNotExist: + raise JsonableError("The upload was not successful. Please reupload the file again in a new message.") + return False + +def create_attachment(file_name, path_id, user_profile): + Attachment.objects.create(file_name=file_name, path_id=path_id, owner=user_profile) + return True def upload_message_image_through_web_client(request, user_file, user_profile): uploaded_file_name, content_type = get_file_info(request, user_file) diff --git a/zerver/management/commands/delete_old_unclaimed_attachments.py b/zerver/management/commands/delete_old_unclaimed_attachments.py new file mode 100644 index 0000000000..c88bf2d241 --- /dev/null +++ b/zerver/management/commands/delete_old_unclaimed_attachments.py @@ -0,0 +1,44 @@ +from __future__ import absolute_import +from __future__ import print_function + +from django.core.management.base import BaseCommand + +from zerver.lib.actions import do_delete_old_unclaimed_attachments +from zerver.models import Attachment, get_old_unclaimed_attachments + +class Command(BaseCommand): + help = """Remove unclaimed attachments from storage older than a supplied + numerical value indicating the limit of how old the attachment can be. + One week is taken as the default value.""" + + def add_arguments(self, parser): + parser.add_argument('-w', '--weeks', + dest='delta_weeks', + default=1, + help="Limiting value of how old the file can be.") + + parser.add_argument('-f', '--for-real', + dest='for_real', + action='store_true', + default=False, + help="Actually remove the files from the storage.") + + def handle(self, *args, **options): + + delta_weeks = options['delta_weeks'] + print("Deleting unclaimed attached files older than %s" % (delta_weeks,)) + print("") + + # print the list of files that are going to be removed + old_attachments = get_old_unclaimed_attachments(delta_weeks) + for old_attachment in old_attachments: + print("%s created at %s" % (old_attachment.file_name, old_attachment.create_time)) + + print("") + if not options["for_real"]: + print("This was a dry run. Pass -f to actually delete.") + exit(1) + + do_delete_old_unclaimed_attachments(delta_weeks) + print("") + print("Unclaimed Files deleted.") diff --git a/zerver/migrations/0015_attachment.py b/zerver/migrations/0015_attachment.py new file mode 100644 index 0000000000..f74624c2e7 --- /dev/null +++ b/zerver/migrations/0015_attachment.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import models, migrations +import django.utils.timezone +from django.conf import settings + + +class Migration(migrations.Migration): + + dependencies = [ + ('zerver', '0014_realm_emoji_url_length'), + ] + + operations = [ + migrations.CreateModel( + name='Attachment', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('file_name', models.CharField(max_length=100, db_index=True)), + ('path_id', models.TextField(db_index=True)), + ('create_time', models.DateTimeField(default=django.utils.timezone.now, db_index=True)), + ('messages', models.ManyToManyField(to='zerver.Message')), + ('owner', models.ForeignKey(to=settings.AUTH_USER_MODEL)), + ], + ), + ] diff --git a/zerver/models.py b/zerver/models.py index 11754e214e..02138eacf8 100644 --- a/zerver/models.py +++ b/zerver/models.py @@ -30,6 +30,8 @@ import pylibmc import re import ujson import logging +import time +import datetime bugdown = None # type: Any @@ -1066,6 +1068,39 @@ def parse_usermessage_flags(val): mask <<= 1 return flags +class Attachment(models.Model): + MAX_FILENAME_LENGTH = 100 + file_name = models.CharField(max_length=MAX_FILENAME_LENGTH, db_index=True) + # path_id is a storage location agnostic representation of the path of the file. + # If the path of a file is http://localhost:9991/user_uploads/a/b/abc/temp_file.py + # then its path_id will be a/b/abc/temp_file.py. + path_id = models.TextField(db_index=True) + owner = models.ForeignKey(UserProfile) + messages = models.ManyToManyField(Message) + create_time = models.DateTimeField(default=timezone.now, db_index=True) + + def __repr__(self): + return (u"" % (self.file_name)) + + def is_claimed(self): + return self.messages.count() > 0 + + def get_url(self): + return "/user_uploads/%s" % (self.path_id) + +def get_attachments_by_owner_id(uid): + return Attachment.objects.filter(owner=uid).select_related('owner') + +def get_owners_from_file_name(file_name): + # The returned vaule will list of owners since different users can upload + # same files with the same filename. + return Attachment.objects.filter(file_name=file_name).select_related('owner') + +def get_old_unclaimed_attachments(weeks_ago): + delta_weeks_ago = timezone.now() - datetime.timedelta(weeks=weeks_ago) + old_attachments = Attachment.objects.filter(messages=None, create_time__lt=delta_weeks_ago) + return old_attachments + class Subscription(models.Model): user_profile = models.ForeignKey(UserProfile) recipient = models.ForeignKey(Recipient) diff --git a/zerver/tests/test_messages.py b/zerver/tests/test_messages.py index 2c489864f2..31ed216181 100644 --- a/zerver/tests/test_messages.py +++ b/zerver/tests/test_messages.py @@ -26,7 +26,7 @@ from zerver.lib.test_helpers import ( from zerver.models import ( MAX_MESSAGE_LENGTH, MAX_SUBJECT_LENGTH, - Client, Message, Realm, Recipient, Stream, Subscription, UserMessage, UserProfile, + Client, Message, Realm, Recipient, Stream, Subscription, UserMessage, UserProfile, Attachment, get_display_recipient, get_recipient, get_realm, get_stream, get_user_profile_by_email, ) @@ -36,6 +36,8 @@ from zerver.lib.actions import ( do_add_subscription, do_create_user, ) +from zerver.lib.upload import create_attachment + import datetime import time import re @@ -1407,7 +1409,7 @@ class StarTests(AuthedTestCase): self.assertEqual(sent_message.message.content, content) self.assertFalse(sent_message.flags.starred) -class AttachmentTest(TestCase): +class AttachmentTest(AuthedTestCase): def test_basics(self): self.assertFalse(Message.content_has_attachment('whatever')) self.assertFalse(Message.content_has_attachment('yo http://foo.com')) @@ -1432,6 +1434,32 @@ class AttachmentTest(TestCase): self.assertTrue(Message.content_has_link('https://humbug-user-uploads.s3.amazonaws.com/sX_TIQx/screen-shot.jpg')) self.assertTrue(Message.content_has_link('https://humbug-user-uploads-test.s3.amazonaws.com/sX_TIQx/screen-shot.jpg')) + def test_claim_attachment(self): + # Create dummy DB entry + sender_email = "hamlet@zulip.com" + user_profile = get_user_profile_by_email(sender_email) + dummy_files = [ + ('zulip.txt', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt'), + ('temp_file.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py'), + ('abc.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py') + ] + + for file_name, path_id in dummy_files: + create_attachment(file_name, path_id, user_profile) + + # Send message referring the attachment + self.subscribe_to_stream(sender_email, "Denmark") + + body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" + \ + "http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." + \ + "http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py" + + self.send_message(sender_email, "Denmark", Recipient.STREAM, body, "test") + + for file_name, path_id in dummy_files: + attachment = Attachment.objects.get(path_id=path_id) + self.assertTrue(attachment.is_claimed()) + class CheckMessageTest(AuthedTestCase): def test_basic_check_message_call(self): sender = get_user_profile_by_email('othello@zulip.com') diff --git a/zerver/tests/test_upload.py b/zerver/tests/test_upload.py index 0c796f0d8b..17831bac2f 100644 --- a/zerver/tests/test_upload.py +++ b/zerver/tests/test_upload.py @@ -6,7 +6,11 @@ from unittest import skip from zerver.lib.test_helpers import AuthedTestCase from zerver.lib.test_runner import slow -from zerver.lib.upload import sanitize_name +from zerver.lib.upload import sanitize_name, delete_message_image, delete_message_image_local, \ + delete_message_image_s3, upload_message_image_s3, upload_message_image_local +from zerver.models import Attachment, Recipient, get_user_profile_by_email, \ + get_old_unclaimed_attachments +from zerver.lib.actions import do_delete_old_unclaimed_attachments import ujson from six.moves import urllib @@ -16,6 +20,12 @@ from boto.s3.key import Key from six.moves import StringIO import os import shutil +import re +import datetime +from datetime import timedelta +from django.utils import timezone + +from moto import mock_s3 TEST_AVATAR_DIR = os.path.join(os.path.dirname(__file__), 'images') @@ -46,9 +56,13 @@ class FileUploadTest(AuthedTestCase): result = self.client.post("/json/upload_file") self.assert_json_error(result, "You must specify a file to upload") + # This test will go through the code path for uploading files onto LOCAL storage + # when zulip is in DEVELOPMENT mode. def test_file_upload_authed(self): """ - A call to /json/upload_file should return a uri and actually create an object. + A call to /json/upload_file should return a uri and actually create an + entry in the database. This entry will be marked unclaimed till a message + refers it. """ self.login("hamlet@zulip.com") fp = StringIO("zulip!") @@ -62,10 +76,72 @@ class FileUploadTest(AuthedTestCase): base = '/user_uploads/' self.assertEquals(base, uri[:len(base)]) + # In the future, local file requests will follow the same style as S3 + # requests; they will be first authenthicated and redirected response = self.client.get(uri) data = "".join(response.streaming_content) self.assertEquals("zulip!", data) + # check if DB has attachment marked as unclaimed + entry = Attachment.objects.get(file_name='zulip.txt') + self.assertEquals(entry.is_claimed(), False) + + def test_delete_old_unclaimed_attachments(self): + # Upload some files and make them older than a weeek + self.login("hamlet@zulip.com") + d1 = StringIO("zulip!") + d1.name = "dummy_1.txt" + result = self.client.post("/json/upload_file", {'file': d1}) + json = ujson.loads(result.content) + uri = json["uri"] + d1_path_id = re.sub('/user_uploads/', '', uri) + + d2 = StringIO("zulip!") + d2.name = "dummy_2.txt" + result = self.client.post("/json/upload_file", {'file': d2}) + json = ujson.loads(result.content) + uri = json["uri"] + d2_path_id = re.sub('/user_uploads/', '', uri) + + two_week_ago = timezone.now() - datetime.timedelta(weeks=2) + d1_attachment = Attachment.objects.get(path_id = d1_path_id) + d1_attachment.create_time = two_week_ago + d1_attachment.save() + d2_attachment = Attachment.objects.get(path_id = d2_path_id) + d2_attachment.create_time = two_week_ago + d2_attachment.save() + + # Send message refering only dummy_1 + self.subscribe_to_stream("hamlet@zulip.com", "Denmark") + body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")" + self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test") + + # dummy_2 should not exist in database or the uploads folder + do_delete_old_unclaimed_attachments(2) + self.assertTrue(not Attachment.objects.filter(path_id = d2_path_id).exists()) + self.assertTrue(not delete_message_image(d2_path_id)) + + def test_multiple_claim_attachments(self): + """ + This test tries to claim the same attachment twice. The messages field in + the Attachment model should have both the messages in its entry. + """ + self.login("hamlet@zulip.com") + d1 = StringIO("zulip!") + d1.name = "dummy_1.txt" + result = self.client.post("/json/upload_file", {'file': d1}) + json = ujson.loads(result.content) + uri = json["uri"] + d1_path_id = re.sub('/user_uploads/', '', uri) + + self.subscribe_to_stream("hamlet@zulip.com", "Denmark") + body = "First message ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")" + self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test") + body = "Second message ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")" + self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test") + + self.assertEquals(Attachment.objects.get(path_id=d1_path_id).messages.count(), 2) + def tearDown(self): destroy_uploads() @@ -136,6 +212,8 @@ class SetAvatarTest(AuthedTestCase): def tearDown(self): destroy_uploads() + + class S3Test(AuthedTestCase): # full URIs in public bucket test_uris = [] # type: List[str]