Add Attachment model to keep track of uploads.

This commit adds the capability to keep track and remove uploaded
files.  Unclaimed attachments are files that have been uploaded to the
server but are not referred in any messages.  A management command to
remove old unclaimed files after a week is also included.

Tests for getting the file referred in messages are also included.
This commit is contained in:
rahuldeve
2016-03-25 00:54:01 +05:30
committed by Tim Abbott
parent 391a225595
commit dde832b158
7 changed files with 294 additions and 7 deletions

View File

@@ -8,7 +8,7 @@ from django.contrib.sessions.models import Session
from zerver.lib.cache import flush_user_profile
from zerver.lib.context_managers import lockfile
from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity, \
Subscription, Recipient, Message, UserMessage, valid_stream_name, \
Subscription, Recipient, Message, Attachment, UserMessage, valid_stream_name, \
DefaultStream, UserPresence, Referral, PushDeviceToken, MAX_SUBJECT_LENGTH, \
MAX_MESSAGE_LENGTH, get_client, get_stream, get_recipient, get_huddle, \
get_user_profile_by_id, PreregistrationUser, get_display_recipient, \
@@ -17,7 +17,8 @@ from zerver.models import Realm, RealmEmoji, Stream, UserProfile, UserActivity,
get_user_profile_by_email, get_stream_cache_key, to_dict_cache_key_id, \
UserActivityInterval, get_active_user_dicts_in_realm, get_active_streams, \
realm_filters_for_domain, RealmFilter, receives_offline_notifications, \
ScheduledJob, realm_filters_for_domain, get_active_bot_dicts_in_realm
ScheduledJob, realm_filters_for_domain, get_active_bot_dicts_in_realm, \
get_old_unclaimed_attachments
from zerver.lib.avatar import get_avatar_url, avatar_url
@@ -57,6 +58,7 @@ from zerver.lib.push_notifications import num_push_devices_for_user, \
from zerver.lib.notifications import clear_followup_emails_queue
from zerver.lib.narrow import check_supported_events_narrow_filter
from zerver.lib.session_user import get_session_user
from zerver.lib.upload import claim_attachment, delete_message_image
import DNS
import ujson
@@ -609,6 +611,11 @@ def do_send_messages(messages):
ums.extend(ums_to_create)
UserMessage.objects.bulk_create(ums)
# Claim attachments in message
for message in messages:
if Message.content_has_attachment(message['message'].content):
do_claim_attachments(message)
for message in messages:
cache_save_message(message['message'])
# Render Markdown etc. here and store (automatically) in
@@ -2998,3 +3005,26 @@ def do_get_streams(user_profile, include_public=True, include_subscribed=True,
streams.sort(key=lambda elt: elt["name"])
return streams
def do_claim_attachments(message):
atttachment_url_re = re.compile('[/\-]user[\-_]uploads[/\.-].*?(?=[ )]|\Z)')
attachment_url_list = atttachment_url_re.findall(message['message'].content)
results = []
for url in attachment_url_list:
path_id = re.sub('[/\-]user[\-_]uploads[/\.-]', '', url)
# Remove any extra '.' after file extension. These are probably added by the user
path_id = re.sub('[.]+$', '', path_id, re.M)
if path_id is not None:
is_claimed = claim_attachment(path_id, message['message'])
results.append((path_id, is_claimed))
return results
def do_delete_old_unclaimed_attachments(weeks_ago):
old_unclaimed_attachments = get_old_unclaimed_attachments(weeks_ago)
for attachment in old_unclaimed_attachments:
delete_message_image(attachment.path_id)
attachment.delete()

View File

@@ -14,6 +14,7 @@ from boto.s3.connection import S3Connection
from mimetypes import guess_type, guess_extension
from zerver.models import get_user_profile_by_id
from zerver.models import Attachment
import base64
import os
@@ -21,6 +22,7 @@ import re
from PIL import Image, ImageOps
from six.moves import cStringIO as StringIO
import random
import logging
# Performance Note:
#
@@ -133,6 +135,8 @@ def upload_message_image_s3(uploaded_file_name, content_type, file_data, user_pr
user_profile,
file_data
)
create_attachment(uploaded_file_name, s3_file_name, user_profile)
return url
def get_signed_upload_url(path):
@@ -147,6 +151,20 @@ def get_realm_for_filename(path):
return None
return get_user_profile_by_id(key.metadata["user_profile_id"]).realm.id
def delete_message_image_s3(path_id):
conn = S3Connection(settings.S3_KEY, settings.S3_SECRET_KEY)
bucket = get_bucket(conn, settings.S3_AUTH_UPLOADS_BUCKET)
# check if file exists
key = bucket.get_key(path_id)
if key is not None:
bucket.delete_key(key)
return True
file_name = path_id.split("/")[-1]
logging.warning("%s does not exist. Its entry in the database will be removed." % (file_name,))
return False
def upload_avatar_image_s3(user_file, user_profile, email):
content_type = guess_type(user_file.name)[0]
bucket_name = settings.S3_AVATAR_BUCKET
@@ -195,9 +213,20 @@ def upload_message_image_local(uploaded_file_name, content_type, file_data, user
])
write_local_file('files', path, file_data)
create_attachment(uploaded_file_name, path, user_profile)
return '/user_uploads/' + path
def delete_message_image_local(path_id):
file_path = os.path.join(settings.LOCAL_UPLOADS_DIR, 'files', path_id)
if os.path.isfile(file_path):
# This removes the file but the empty folders still remain.
os.remove(file_path)
return True
file_name = path_id.split("/")[-1]
logging.warning("%s does not exist. Its entry in the database will be removed." % (file_name,))
return False
def upload_avatar_image_local(user_file, user_profile, email):
email_hash = user_avatar_hash(email)
@@ -212,9 +241,25 @@ def upload_avatar_image_local(user_file, user_profile, email):
if settings.LOCAL_UPLOADS_DIR is not None:
upload_message_image = upload_message_image_local
upload_avatar_image = upload_avatar_image_local
delete_message_image = delete_message_image_local
else:
upload_message_image = upload_message_image_s3
upload_avatar_image = upload_avatar_image_s3
delete_message_image = delete_message_image_s3
def claim_attachment(path_id, message):
try:
attachment = Attachment.objects.get(path_id=path_id)
attachment.messages.add(message)
attachment.save()
return True
except Attachment.DoesNotExist:
raise JsonableError("The upload was not successful. Please reupload the file again in a new message.")
return False
def create_attachment(file_name, path_id, user_profile):
Attachment.objects.create(file_name=file_name, path_id=path_id, owner=user_profile)
return True
def upload_message_image_through_web_client(request, user_file, user_profile):
uploaded_file_name, content_type = get_file_info(request, user_file)

View File

@@ -0,0 +1,44 @@
from __future__ import absolute_import
from __future__ import print_function
from django.core.management.base import BaseCommand
from zerver.lib.actions import do_delete_old_unclaimed_attachments
from zerver.models import Attachment, get_old_unclaimed_attachments
class Command(BaseCommand):
help = """Remove unclaimed attachments from storage older than a supplied
numerical value indicating the limit of how old the attachment can be.
One week is taken as the default value."""
def add_arguments(self, parser):
parser.add_argument('-w', '--weeks',
dest='delta_weeks',
default=1,
help="Limiting value of how old the file can be.")
parser.add_argument('-f', '--for-real',
dest='for_real',
action='store_true',
default=False,
help="Actually remove the files from the storage.")
def handle(self, *args, **options):
delta_weeks = options['delta_weeks']
print("Deleting unclaimed attached files older than %s" % (delta_weeks,))
print("")
# print the list of files that are going to be removed
old_attachments = get_old_unclaimed_attachments(delta_weeks)
for old_attachment in old_attachments:
print("%s created at %s" % (old_attachment.file_name, old_attachment.create_time))
print("")
if not options["for_real"]:
print("This was a dry run. Pass -f to actually delete.")
exit(1)
do_delete_old_unclaimed_attachments(delta_weeks)
print("")
print("Unclaimed Files deleted.")

View File

@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import models, migrations
import django.utils.timezone
from django.conf import settings
class Migration(migrations.Migration):
dependencies = [
('zerver', '0014_realm_emoji_url_length'),
]
operations = [
migrations.CreateModel(
name='Attachment',
fields=[
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
('file_name', models.CharField(max_length=100, db_index=True)),
('path_id', models.TextField(db_index=True)),
('create_time', models.DateTimeField(default=django.utils.timezone.now, db_index=True)),
('messages', models.ManyToManyField(to='zerver.Message')),
('owner', models.ForeignKey(to=settings.AUTH_USER_MODEL)),
],
),
]

View File

@@ -30,6 +30,8 @@ import pylibmc
import re
import ujson
import logging
import time
import datetime
bugdown = None # type: Any
@@ -1066,6 +1068,39 @@ def parse_usermessage_flags(val):
mask <<= 1
return flags
class Attachment(models.Model):
MAX_FILENAME_LENGTH = 100
file_name = models.CharField(max_length=MAX_FILENAME_LENGTH, db_index=True)
# path_id is a storage location agnostic representation of the path of the file.
# If the path of a file is http://localhost:9991/user_uploads/a/b/abc/temp_file.py
# then its path_id will be a/b/abc/temp_file.py.
path_id = models.TextField(db_index=True)
owner = models.ForeignKey(UserProfile)
messages = models.ManyToManyField(Message)
create_time = models.DateTimeField(default=timezone.now, db_index=True)
def __repr__(self):
return (u"<Attachment: %s>" % (self.file_name))
def is_claimed(self):
return self.messages.count() > 0
def get_url(self):
return "/user_uploads/%s" % (self.path_id)
def get_attachments_by_owner_id(uid):
return Attachment.objects.filter(owner=uid).select_related('owner')
def get_owners_from_file_name(file_name):
# The returned vaule will list of owners since different users can upload
# same files with the same filename.
return Attachment.objects.filter(file_name=file_name).select_related('owner')
def get_old_unclaimed_attachments(weeks_ago):
delta_weeks_ago = timezone.now() - datetime.timedelta(weeks=weeks_ago)
old_attachments = Attachment.objects.filter(messages=None, create_time__lt=delta_weeks_ago)
return old_attachments
class Subscription(models.Model):
user_profile = models.ForeignKey(UserProfile)
recipient = models.ForeignKey(Recipient)

View File

@@ -26,7 +26,7 @@ from zerver.lib.test_helpers import (
from zerver.models import (
MAX_MESSAGE_LENGTH, MAX_SUBJECT_LENGTH,
Client, Message, Realm, Recipient, Stream, Subscription, UserMessage, UserProfile,
Client, Message, Realm, Recipient, Stream, Subscription, UserMessage, UserProfile, Attachment,
get_display_recipient, get_recipient, get_realm, get_stream, get_user_profile_by_email,
)
@@ -36,6 +36,8 @@ from zerver.lib.actions import (
do_add_subscription, do_create_user,
)
from zerver.lib.upload import create_attachment
import datetime
import time
import re
@@ -1407,7 +1409,7 @@ class StarTests(AuthedTestCase):
self.assertEqual(sent_message.message.content, content)
self.assertFalse(sent_message.flags.starred)
class AttachmentTest(TestCase):
class AttachmentTest(AuthedTestCase):
def test_basics(self):
self.assertFalse(Message.content_has_attachment('whatever'))
self.assertFalse(Message.content_has_attachment('yo http://foo.com'))
@@ -1432,6 +1434,32 @@ class AttachmentTest(TestCase):
self.assertTrue(Message.content_has_link('https://humbug-user-uploads.s3.amazonaws.com/sX_TIQx/screen-shot.jpg'))
self.assertTrue(Message.content_has_link('https://humbug-user-uploads-test.s3.amazonaws.com/sX_TIQx/screen-shot.jpg'))
def test_claim_attachment(self):
# Create dummy DB entry
sender_email = "hamlet@zulip.com"
user_profile = get_user_profile_by_email(sender_email)
dummy_files = [
('zulip.txt', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt'),
('temp_file.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py'),
('abc.py', '1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py')
]
for file_name, path_id in dummy_files:
create_attachment(file_name, path_id, user_profile)
# Send message referring the attachment
self.subscribe_to_stream(sender_email, "Denmark")
body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" + \
"http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." + \
"http://localhost:9991/user_uploads/1/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py"
self.send_message(sender_email, "Denmark", Recipient.STREAM, body, "test")
for file_name, path_id in dummy_files:
attachment = Attachment.objects.get(path_id=path_id)
self.assertTrue(attachment.is_claimed())
class CheckMessageTest(AuthedTestCase):
def test_basic_check_message_call(self):
sender = get_user_profile_by_email('othello@zulip.com')

View File

@@ -6,7 +6,11 @@ from unittest import skip
from zerver.lib.test_helpers import AuthedTestCase
from zerver.lib.test_runner import slow
from zerver.lib.upload import sanitize_name
from zerver.lib.upload import sanitize_name, delete_message_image, delete_message_image_local, \
delete_message_image_s3, upload_message_image_s3, upload_message_image_local
from zerver.models import Attachment, Recipient, get_user_profile_by_email, \
get_old_unclaimed_attachments
from zerver.lib.actions import do_delete_old_unclaimed_attachments
import ujson
from six.moves import urllib
@@ -16,6 +20,12 @@ from boto.s3.key import Key
from six.moves import StringIO
import os
import shutil
import re
import datetime
from datetime import timedelta
from django.utils import timezone
from moto import mock_s3
TEST_AVATAR_DIR = os.path.join(os.path.dirname(__file__), 'images')
@@ -46,9 +56,13 @@ class FileUploadTest(AuthedTestCase):
result = self.client.post("/json/upload_file")
self.assert_json_error(result, "You must specify a file to upload")
# This test will go through the code path for uploading files onto LOCAL storage
# when zulip is in DEVELOPMENT mode.
def test_file_upload_authed(self):
"""
A call to /json/upload_file should return a uri and actually create an object.
A call to /json/upload_file should return a uri and actually create an
entry in the database. This entry will be marked unclaimed till a message
refers it.
"""
self.login("hamlet@zulip.com")
fp = StringIO("zulip!")
@@ -62,10 +76,72 @@ class FileUploadTest(AuthedTestCase):
base = '/user_uploads/'
self.assertEquals(base, uri[:len(base)])
# In the future, local file requests will follow the same style as S3
# requests; they will be first authenthicated and redirected
response = self.client.get(uri)
data = "".join(response.streaming_content)
self.assertEquals("zulip!", data)
# check if DB has attachment marked as unclaimed
entry = Attachment.objects.get(file_name='zulip.txt')
self.assertEquals(entry.is_claimed(), False)
def test_delete_old_unclaimed_attachments(self):
# Upload some files and make them older than a weeek
self.login("hamlet@zulip.com")
d1 = StringIO("zulip!")
d1.name = "dummy_1.txt"
result = self.client.post("/json/upload_file", {'file': d1})
json = ujson.loads(result.content)
uri = json["uri"]
d1_path_id = re.sub('/user_uploads/', '', uri)
d2 = StringIO("zulip!")
d2.name = "dummy_2.txt"
result = self.client.post("/json/upload_file", {'file': d2})
json = ujson.loads(result.content)
uri = json["uri"]
d2_path_id = re.sub('/user_uploads/', '', uri)
two_week_ago = timezone.now() - datetime.timedelta(weeks=2)
d1_attachment = Attachment.objects.get(path_id = d1_path_id)
d1_attachment.create_time = two_week_ago
d1_attachment.save()
d2_attachment = Attachment.objects.get(path_id = d2_path_id)
d2_attachment.create_time = two_week_ago
d2_attachment.save()
# Send message refering only dummy_1
self.subscribe_to_stream("hamlet@zulip.com", "Denmark")
body = "Some files here ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")"
self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test")
# dummy_2 should not exist in database or the uploads folder
do_delete_old_unclaimed_attachments(2)
self.assertTrue(not Attachment.objects.filter(path_id = d2_path_id).exists())
self.assertTrue(not delete_message_image(d2_path_id))
def test_multiple_claim_attachments(self):
"""
This test tries to claim the same attachment twice. The messages field in
the Attachment model should have both the messages in its entry.
"""
self.login("hamlet@zulip.com")
d1 = StringIO("zulip!")
d1.name = "dummy_1.txt"
result = self.client.post("/json/upload_file", {'file': d1})
json = ujson.loads(result.content)
uri = json["uri"]
d1_path_id = re.sub('/user_uploads/', '', uri)
self.subscribe_to_stream("hamlet@zulip.com", "Denmark")
body = "First message ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")"
self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test")
body = "Second message ...[zulip.txt](http://localhost:9991/user_uploads/" + d1_path_id + ")"
self.send_message("hamlet@zulip.com", "Denmark", Recipient.STREAM, body, "test")
self.assertEquals(Attachment.objects.get(path_id=d1_path_id).messages.count(), 2)
def tearDown(self):
destroy_uploads()
@@ -136,6 +212,8 @@ class SetAvatarTest(AuthedTestCase):
def tearDown(self):
destroy_uploads()
class S3Test(AuthedTestCase):
# full URIs in public bucket
test_uris = [] # type: List[str]