messages: Set has_attachment correctly using Bugdown.

Previously, we would naively set has_attachment just by searching
the whole messages for strings like `/user_uploads/...`. We now
prevent running do_claim_attachments for messages that obviously
do not have an attachment in them that we previously ran.

For example: attachments in codeblocks or
             attachments that otherwise do not match our link syntax.

The new implementation runs that check on only the urls that
bugdown determines should be rendered. We also refactor some
Attachment tests in test_messages to test this change.

The new method is:

1. Create a list of potential_attachment_urls in Bugdown while rendering.
2. Loop over this list in do_claim_attachments for the actual claiming.
   For saving:
3. If we claimed an attachment, set message.has_attachment to True.
   For updating:
3. If claimed_attachment != message.has_attachment: update has_attachment.

We do not modify the logic for 'unclaiming' attachments when editing.
This commit is contained in:
Rohitt Vashishtha
2019-12-11 02:49:30 +05:30
committed by Tim Abbott
parent 4674cc5098
commit 3892a8afd8
5 changed files with 128 additions and 43 deletions

View File

@@ -1397,6 +1397,13 @@ def do_send_messages(messages_maybe_none: Sequence[Optional[MutableMapping[str,
user_message_flags = defaultdict(dict) # type: Dict[int, Dict[int, List[str]]]
with transaction.atomic():
Message.objects.bulk_create([message['message'] for message in messages])
# Claim attachments in message
for message in messages:
if do_claim_attachments(message['message']):
message['message'].has_attachment = True
message['message'].save(update_fields=['has_attachment'])
ums = [] # type: List[UserMessageLite]
for message in messages:
# Service bots (outgoing webhook bots and embedded bots) don't store UserMessage rows;
@@ -1427,11 +1434,6 @@ def do_send_messages(messages_maybe_none: Sequence[Optional[MutableMapping[str,
bulk_insert_ums(ums)
# Claim attachments in message
for message in messages:
if Message.content_has_attachment(message['message'].content):
do_claim_attachments(message['message'])
for message in messages:
do_widget_post_save_actions(message)
@@ -4501,7 +4503,7 @@ def do_update_message(user_profile: UserProfile, message: Message, topic_name: O
prev_content = edit_history_event['prev_content']
if Message.content_has_attachment(prev_content) or Message.content_has_attachment(message.content):
check_attachment_reference_change(prev_content, message)
message.has_attachment = check_attachment_reference_change(prev_content, message)
if message.is_stream_message():
if topic_name is not None:
@@ -5451,9 +5453,12 @@ def notify_attachment_update(user_profile: UserProfile, op: str,
}
send_event(user_profile.realm, event, [user_profile.id])
def do_claim_attachments(message: Message) -> None:
attachment_url_list = attachment_url_re.findall(message.content)
def do_claim_attachments(message: Message) -> bool:
attachment_url_list = message.potential_attachment_urls
if not attachment_url_list:
return False
claimed = False
for url in attachment_url_list:
path_id = attachment_url_to_path_id(url)
user_profile = message.sender
@@ -5475,8 +5480,10 @@ def do_claim_attachments(message: Message) -> None:
user_profile.id, path_id, message.id))
continue
claimed = True
attachment = claim_attachment(user_profile, path_id, message, is_message_realm_public)
notify_attachment_update(user_profile, "update", attachment.to_dict())
return claimed
def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
old_unclaimed_attachments = get_old_unclaimed_attachments(weeks_ago)
@@ -5485,7 +5492,7 @@ def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
delete_message_image(attachment.path_id)
attachment.delete()
def check_attachment_reference_change(prev_content: str, message: Message) -> None:
def check_attachment_reference_change(prev_content: str, message: Message) -> bool:
new_content = message.content
prev_attachments = set(attachment_url_re.findall(prev_content))
new_attachments = set(attachment_url_re.findall(new_content))
@@ -5499,9 +5506,11 @@ def check_attachment_reference_change(prev_content: str, message: Message) -> No
attachments_to_update = Attachment.objects.filter(path_id__in=path_ids).select_for_update()
message.attachment_set.remove(*attachments_to_update)
claimed = False
to_add = list(new_attachments - prev_attachments)
if len(to_add) > 0:
do_claim_attachments(message)
claimed = do_claim_attachments(message)
return claimed
def notify_realm_custom_profile_fields(realm: Realm, operation: str) -> None:
fields = custom_profile_fields_for_realm(realm.id)

View File

@@ -129,6 +129,10 @@ STREAM_TOPIC_LINK_REGEX = r"""
def get_compiled_stream_topic_link_regex() -> Pattern:
return verbose_compile(STREAM_TOPIC_LINK_REGEX)
@one_time
def get_compiled_attachment_regex() -> Pattern:
return verbose_compile(r'(?P<path>[/\-]user[\-_]uploads[/\.-].*)')
LINK_REGEX = None # type: Pattern
def get_web_link_regex() -> str:
@@ -1044,18 +1048,36 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
def is_absolute_url(self, url: str) -> bool:
return bool(urllib.parse.urlparse(url).netloc)
def maybe_append_attachment_url(self, url: str) -> None:
attachment_regex = get_compiled_attachment_regex()
m = attachment_regex.match(url)
if m:
urls = self.markdown.zulip_message.potential_attachment_urls
urls.append(m.group('path'))
self.markdown.zulip_message.potential_attachment_urls = urls
def run(self, root: Element) -> None:
# Get all URLs from the blob
found_urls = walk_tree_with_family(root, self.get_url_data)
unique_urls = {found_url.result[0] for found_url in found_urls}
# Collect unique URLs which are not quoted as we don't do
# inline previews for links inside blockquotes.
unique_previewable_urls = {found_url.result[0] for found_url in found_urls
if not found_url.family.in_blockquote}
# Set has_link and similar flags whenever a message is processed by bugdown
if self.markdown.zulip_message:
self.markdown.zulip_message.has_link = len(found_urls) > 0
self.markdown.zulip_message.has_image = False # This is updated in self.add_a
# Populate potential_attachment_urls. do_claim_attachments() handles setting has_attachments.
self.markdown.zulip_message.potential_attachment_urls = []
for url in unique_urls:
self.maybe_append_attachment_url(url)
# Collect unique URLs which are not quoted
unique_urls = {found_url.result[0] for found_url in found_urls if not found_url.family.in_blockquote}
if len(found_urls) == 0 or len(unique_urls) > self.INLINE_PREVIEW_LIMIT_PER_MESSAGE:
if len(found_urls) == 0:
return
if len(unique_previewable_urls) > self.INLINE_PREVIEW_LIMIT_PER_MESSAGE:
return
processed_urls = set() # type: Set[str]
@@ -1064,7 +1086,7 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
for found_url in found_urls:
(url, text) = found_url.result
if url in unique_urls and url not in processed_urls:
if url in unique_previewable_urls and url not in processed_urls:
processed_urls.add(url)
else:
continue

View File

@@ -1726,6 +1726,14 @@ class Message(AbstractMessage):
'website', 'ios', 'android')) or (
'desktop app' in sending_client)
@property
def potential_attachment_urls(self) -> List[str]:
return getattr(self, '_potential_attachment_urls', [])
@potential_attachment_urls.setter
def potential_attachment_urls(self, urls: List[str]) -> None:
self._potential_attachment_urls = urls
@staticmethod
def content_has_attachment(content: str) -> Match:
return re.search(r'[/\-]user[\-_]uploads[/\.-]', content)
@@ -1742,8 +1750,7 @@ class Message(AbstractMessage):
def update_calculated_fields(self) -> None:
# TODO: rendered_content could also be considered a calculated field
content = self.content
self.has_attachment = bool(Message.content_has_attachment(content))
return
@receiver(pre_save, sender=Message)
def pre_save_message(sender: Any, **kwargs: Any) -> None:

View File

@@ -3570,19 +3570,9 @@ class MessageAccessTests(ZulipTestCase):
self.assertEqual(len(filtered_messages), 2)
class AttachmentTest(ZulipTestCase):
def test_basics(self) -> None:
zulip_realm = get_realm("zulip")
self.assertFalse(Message.content_has_attachment('whatever'))
self.assertFalse(Message.content_has_attachment('yo http://foo.com'))
self.assertTrue(Message.content_has_attachment('yo\n https://staging.zulip.com/user_uploads/'))
self.assertTrue(Message.content_has_attachment('yo\n /user_uploads/%s/wEAnI-PEmVmCjo15xxNaQbnj/photo-10.jpg foo' % (
zulip_realm.id,)))
def test_claim_attachment(self) -> None:
# Create dummy DB entry
user_profile = self.example_user('hamlet')
class MessageHasKeywordsTest(ZulipTestCase):
'''Test for keywords like has_link, has_image, has_attachment.'''
def setup_dummy_attachments(self, user_profile: UserProfile) -> List[str]:
sample_size = 10
realm_id = user_profile.realm_id
dummy_files = [
@@ -3594,21 +3584,54 @@ class AttachmentTest(ZulipTestCase):
for file_name, path_id, size in dummy_files:
create_attachment(file_name, path_id, user_profile, size)
# return path ids
return [x[1] for x in dummy_files]
def test_basics(self) -> None:
zulip_realm = get_realm("zulip")
self.assertFalse(Message.content_has_attachment('whatever'))
self.assertFalse(Message.content_has_attachment('yo http://foo.com'))
self.assertTrue(Message.content_has_attachment('yo\n https://staging.zulip.com/user_uploads/'))
self.assertTrue(Message.content_has_attachment('yo\n /user_uploads/%s/wEAnI-PEmVmCjo15xxNaQbnj/photo-10.jpg foo' % (
zulip_realm.id,)))
def test_claim_attachment(self) -> None:
user_profile = self.example_user('hamlet')
dummy_path_ids = self.setup_dummy_attachments(user_profile)
dummy_urls = ["http://localhost:9991/user_uploads/{}".format(x) for x in dummy_path_ids]
# Send message referring the attachment
self.subscribe(user_profile, "Denmark")
body = ("Some files here ...[zulip.txt](http://localhost:9991/user_uploads/{realm_id}/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" +
"http://localhost:9991/user_uploads/{realm_id}/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." +
"http://localhost:9991/user_uploads/{realm_id}/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py").format(realm_id=realm_id)
self.send_stream_message(user_profile.email, "Denmark", body, "test")
for file_name, path_id, size in dummy_files:
def assert_attachment_claimed(path_id: str, claimed: bool) -> None:
attachment = Attachment.objects.get(path_id=path_id)
self.assertTrue(attachment.is_claimed())
self.assertEqual(attachment.is_claimed(), claimed)
class MessageHasKeywordsTest(ZulipTestCase):
'''Test for keywords like has_link, has_image, has_attachment'''
# This message should claim attachments 1 only because attachment 2
# is not being parsed as a link by Bugdown.
body = ("Some files here ...[zulip.txt]({})" +
"{}.... Some more...." +
"{}").format(dummy_urls[0], dummy_urls[1], dummy_urls[1])
self.send_stream_message(user_profile.email, "Denmark", body, "test")
assert_attachment_claimed(dummy_path_ids[0], True)
assert_attachment_claimed(dummy_path_ids[1], False)
# This message tries to claim the third attachment but fails because
# Bugdown would not set has_attachments = True here.
body = "Link in code: `{}`".format(dummy_urls[2])
self.send_stream_message(user_profile.email, "Denmark", body, "test")
assert_attachment_claimed(dummy_path_ids[2], False)
# Another scenario where we wouldn't parse the link.
body = "Link to not parse: .{}.`".format(dummy_urls[2])
self.send_stream_message(user_profile.email, "Denmark", body, "test")
assert_attachment_claimed(dummy_path_ids[2], False)
# Finally, claim attachment 3.
body = "Link: {}".format(dummy_urls[2])
self.send_stream_message(user_profile.email, "Denmark", body, "test")
assert_attachment_claimed(dummy_path_ids[2], True)
assert_attachment_claimed(dummy_path_ids[1], False)
def test_finds_all_links(self) -> None:
msg_ids = []
@@ -3645,7 +3668,8 @@ class MessageHasKeywordsTest(ZulipTestCase):
self.assertTrue(msg.has_link)
self.update_message(msg, 'a')
self.assertFalse(msg.has_link)
self.update_message(msg, 'a http://foo.com')
# Check in blockquotes work
self.update_message(msg, '> http://bar.com')
self.assertTrue(msg.has_link)
self.update_message(msg, 'a `http://foo.com`')
self.assertFalse(msg.has_link)
@@ -3667,6 +3691,29 @@ class MessageHasKeywordsTest(ZulipTestCase):
self.update_message(msgs[0], 'No Image Again')
self.assertFalse(msgs[0].has_image)
def test_has_attachment(self) -> None:
# This test could belong in AttachmentTest as well, but
# we keep it with other 'has:' tests.
hamlet = self.example_user('hamlet')
dummy_path_ids = self.setup_dummy_attachments(hamlet)
dummy_urls = ["http://localhost:9991/user_uploads/{}".format(x) for x in dummy_path_ids]
self.subscribe(hamlet, "Denmark")
body = ("Files ...[zulip.txt]({}) {} {}").format(dummy_urls[0], dummy_urls[1], dummy_urls[2])
msg_id = self.send_stream_message(hamlet.email, "Denmark", body, "test")
msg = Message.objects.get(id=msg_id)
self.assertTrue(msg.has_attachment)
self.update_message(msg, 'No Attachments')
self.assertFalse(msg.has_attachment)
self.update_message(msg, body)
self.assertTrue(msg.has_attachment)
self.update_message(msg, 'Link in code: `{}`'.format(dummy_urls[1]))
self.assertFalse(msg.has_attachment)
# Test blockquotes
self.update_message(msg, '> {}'.format(dummy_urls[1]))
self.assertTrue(msg.has_attachment)
class MissedMessageTest(ZulipTestCase):
def test_presence_idle_user_ids(self) -> None:
UserPresence.objects.all().delete()

View File

@@ -143,9 +143,9 @@ class ArchiveMessagesTestingBase(RetentionTestingBase):
create_attachment(file_name, path_id, user_profile, size)
self.subscribe(user_profile, "Denmark")
body = ("Some files here ...[zulip.txt](http://localhost:9991/user_uploads/{id}/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" +
"http://localhost:9991/user_uploads/{id}/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." +
"http://localhost:9991/user_uploads/{id}/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py").format(id=realm_id)
body = ("Some files here ... [zulip.txt](http://localhost:9991/user_uploads/{id}/31/4CBjtTLYZhk66pZrF8hnYGwc/zulip.txt)" +
" http://localhost:9991/user_uploads/{id}/31/4CBjtTLYZhk66pZrF8hnYGwc/temp_file.py.... Some more...." +
" http://localhost:9991/user_uploads/{id}/31/4CBjtTLYZhk66pZrF8hnYGwc/abc.py").format(id=realm_id)
expired_message_id = self.send_stream_message(sender_email, "Denmark", body)
actual_message_id = self.send_stream_message(sender_email, "Denmark", body)