From 9a9c6730ff67f00165a63c2cdd92d6429bc219c1 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Thu, 14 Apr 2022 12:57:20 -0700 Subject: [PATCH] preview: Use cache only as a non-durable cache, not an IPC. The `get_link_embed_data` / `link_embed_data_from_cache` pair as introduced in c93f1d4eda71ddcca5ba05d0bf9fbfc8895829e2 uses the cache as a temporary store inside of the `embed_links` worker; this means that it must be durable storage, or the worker will stall and re-fetch the same links to preview them. Switch to plumbing through the fetched URL embed data as an parameter to the Markdown evaluation which uses them, rather than using the cache as an intermediary. This frees up the cache to be merely a non-durable cache. As a side-effect, this removes get_cache_with_key, and link_embed_data_from_cache which was its only callsite. (cherry picked from commit 351bdfaf789c16055c1e11c8e23408a633fe2e0d) --- zerver/actions/message_send.py | 3 ++ zerver/lib/cache.py | 30 ---------------- zerver/lib/markdown/__init__.py | 51 +++++++++++++++------------ zerver/lib/message.py | 3 ++ zerver/lib/url_preview/preview.py | 9 +---- zerver/tests/test_cache.py | 41 +--------------------- zerver/tests/test_link_embed.py | 58 +++++++++++++------------------ zerver/tests/test_markdown.py | 17 ++++----- zerver/worker/queue_processors.py | 10 ++++-- 9 files changed, 76 insertions(+), 146 deletions(-) diff --git a/zerver/actions/message_send.py b/zerver/actions/message_send.py index 7c93dff988..56c7642118 100644 --- a/zerver/actions/message_send.py +++ b/zerver/actions/message_send.py @@ -61,6 +61,7 @@ from zerver.lib.streams import access_stream_for_send_message, ensure_stream from zerver.lib.string_validation import check_stream_name from zerver.lib.timestamp import timestamp_to_datetime from zerver.lib.topic import filter_by_exact_message_topic +from zerver.lib.url_preview.types import UrlEmbedData from zerver.lib.user_message import UserMessageLite, bulk_insert_ums from zerver.lib.user_mutes import get_muting_users from zerver.lib.validator import check_widget_content @@ -124,6 +125,7 @@ def render_incoming_message( user_ids: Set[int], realm: Realm, mention_data: Optional[MentionData] = None, + url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None, email_gateway: bool = False, ) -> MessageRenderingResult: realm_alert_words_automaton = get_alert_word_automaton(realm) @@ -134,6 +136,7 @@ def render_incoming_message( realm=realm, realm_alert_words_automaton=realm_alert_words_automaton, mention_data=mention_data, + url_embed_data=url_embed_data, email_gateway=email_gateway, ) except MarkdownRenderingException: diff --git a/zerver/lib/cache.py b/zerver/lib/cache.py index 136174a585..f26f84fab1 100644 --- a/zerver/lib/cache.py +++ b/zerver/lib/cache.py @@ -128,36 +128,6 @@ def get_cache_backend(cache_name: Optional[str]) -> BaseCache: return caches[cache_name] -def get_cache_with_key( - keyfunc: Callable[..., str], - cache_name: Optional[str] = None, -) -> Callable[[FuncT], FuncT]: - """ - The main goal of this function getting value from the cache like in the "cache_with_key". - A cache value can contain any data including the "None", so - here used exception for case if value isn't found in the cache. - """ - - def decorator(func: FuncT) -> FuncT: - @wraps(func) - def func_with_caching(*args: object, **kwargs: object) -> object: - key = keyfunc(*args, **kwargs) - try: - val = cache_get(key, cache_name=cache_name) - except InvalidCacheKeyException: - stack_trace = traceback.format_exc() - log_invalid_cache_keys(stack_trace, [key]) - val = None - - if val is not None: - return val[0] - raise NotFoundInCache() - - return cast(FuncT, func_with_caching) # https://github.com/python/mypy/issues/1927 - - return decorator - - def cache_with_key( keyfunc: Callable[..., str], cache_name: Optional[str] = None, diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index ab44f4a2a5..e8376fbd17 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -48,7 +48,7 @@ from tlds import tld_set from typing_extensions import TypedDict from zerver.lib import mention as mention -from zerver.lib.cache import NotFoundInCache, cache_with_key +from zerver.lib.cache import cache_with_key from zerver.lib.camo import get_camo_url from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons from zerver.lib.exceptions import MarkdownRenderingException @@ -63,7 +63,6 @@ from zerver.lib.timeout import TimeoutExpired, timeout from zerver.lib.timezone import common_timezones from zerver.lib.types import LinkifierDict from zerver.lib.url_encoding import encode_stream, hash_util_encode -from zerver.lib.url_preview import preview as link_preview from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData from zerver.models import EmojiInfo, Message, Realm, linkifiers_for_realm @@ -1333,29 +1332,32 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor): if not self.md.url_embed_preview_enabled: continue - try: - extracted_data = link_preview.link_embed_data_from_cache(url) - except NotFoundInCache: + if self.md.url_embed_data is None or url not in self.md.url_embed_data: self.md.zulip_rendering_result.links_for_preview.add(url) continue - if extracted_data: - if youtube is not None: - title = self.youtube_title(extracted_data) - if title is not None: - if url == text: - found_url.family.child.text = title - else: - found_url.family.child.text = text - continue - self.add_embed(root, url, extracted_data) - if self.vimeo_id(url): - title = self.vimeo_title(extracted_data) - if title: - if url == text: - found_url.family.child.text = title - else: - found_url.family.child.text = text + # Existing but being None means that we did process the + # URL, but it was not valid to preview. + extracted_data = self.md.url_embed_data[url] + if extracted_data is None: + continue + + if youtube is not None: + title = self.youtube_title(extracted_data) + if title is not None: + if url == text: + found_url.family.child.text = title + else: + found_url.family.child.text = text + continue + self.add_embed(root, url, extracted_data) + if self.vimeo_id(url): + title = self.vimeo_title(extracted_data) + if title: + if url == text: + found_url.family.child.text = title + else: + found_url.family.child.text = text class CompiledInlineProcessor(markdown.inlinepatterns.InlineProcessor): @@ -2110,6 +2112,7 @@ class Markdown(markdown.Markdown): zulip_rendering_result: Optional[MessageRenderingResult] image_preview_enabled: bool url_embed_preview_enabled: bool + url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] def __init__( self, @@ -2448,6 +2451,7 @@ def do_convert( message_realm: Optional[Realm] = None, sent_by_bot: bool = False, translate_emoticons: bool = False, + url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None, mention_data: Optional[MentionData] = None, email_gateway: bool = False, no_previews: bool = False, @@ -2503,6 +2507,7 @@ def do_convert( _md_engine.url_embed_preview_enabled = url_embed_preview_enabled( message, message_realm, no_previews ) + _md_engine.url_embed_data = url_embed_data # Pre-fetch data from the DB that is used in the Markdown thread if message_realm is not None: @@ -2606,6 +2611,7 @@ def markdown_convert( message_realm: Optional[Realm] = None, sent_by_bot: bool = False, translate_emoticons: bool = False, + url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None, mention_data: Optional[MentionData] = None, email_gateway: bool = False, no_previews: bool = False, @@ -2618,6 +2624,7 @@ def markdown_convert( message_realm, sent_by_bot, translate_emoticons, + url_embed_data, mention_data, email_gateway, no_previews=no_previews, diff --git a/zerver/lib/message.py b/zerver/lib/message.py index 163ed3a356..770c0163e7 100644 --- a/zerver/lib/message.py +++ b/zerver/lib/message.py @@ -39,6 +39,7 @@ from zerver.lib.streams import get_web_public_streams_queryset from zerver.lib.timestamp import datetime_to_timestamp from zerver.lib.topic import DB_TOPIC_NAME, MESSAGE__TOPIC, TOPIC_LINKS, TOPIC_NAME from zerver.lib.types import DisplayRecipientT, EditHistoryEvent, UserDisplayRecipient +from zerver.lib.url_preview.types import UrlEmbedData from zerver.lib.user_topics import build_topic_mute_checker, topic_is_muted from zerver.models import ( MAX_TOPIC_NAME_LENGTH, @@ -899,6 +900,7 @@ def render_markdown( content: str, realm: Optional[Realm] = None, realm_alert_words_automaton: Optional[ahocorasick.Automaton] = None, + url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None, mention_data: Optional[MentionData] = None, email_gateway: bool = False, ) -> MessageRenderingResult: @@ -920,6 +922,7 @@ def render_markdown( message_realm=realm, sent_by_bot=sent_by_bot, translate_emoticons=translate_emoticons, + url_embed_data=url_embed_data, mention_data=mention_data, email_gateway=email_gateway, ) diff --git a/zerver/lib/url_preview/preview.py b/zerver/lib/url_preview/preview.py index 36a6bcdfed..5bf760277b 100644 --- a/zerver/lib/url_preview/preview.py +++ b/zerver/lib/url_preview/preview.py @@ -8,7 +8,7 @@ from django.conf import settings from django.utils.encoding import smart_str from version import ZULIP_VERSION -from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key +from zerver.lib.cache import cache_with_key, preview_url_cache_key from zerver.lib.outgoing_http import OutgoingSession from zerver.lib.pysa import mark_sanitized from zerver.lib.url_preview.oembed import get_oembed_data @@ -117,10 +117,3 @@ def get_link_embed_data( if data.image: data.image = urljoin(response.url, data.image) return data - - -@get_cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME) -def link_embed_data_from_cache( - url: str, maxwidth: int = 640, maxheight: int = 480 -) -> Optional[UrlEmbedData]: - return None diff --git a/zerver/tests/test_cache.py b/zerver/tests/test_cache.py index 955b60ebb7..145d4a4598 100644 --- a/zerver/tests/test_cache.py +++ b/zerver/tests/test_cache.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional from unittest.mock import Mock, patch from django.conf import settings @@ -7,7 +7,6 @@ from zerver.apps import flush_cache from zerver.lib.cache import ( MEMCACHED_MAX_KEY_LENGTH, InvalidCacheKeyException, - NotFoundInCache, bulk_cached_fetch, cache_delete, cache_delete_many, @@ -16,7 +15,6 @@ from zerver.lib.cache import ( cache_set, cache_set_many, cache_with_key, - get_cache_with_key, safe_cache_get_many, safe_cache_set_many, user_profile_by_id_cache_key, @@ -166,43 +164,6 @@ class CacheWithKeyDecoratorTest(ZulipTestCase): self.assert_length(queries, 0) -class GetCacheWithKeyDecoratorTest(ZulipTestCase): - def test_get_cache_with_good_key(self) -> None: - # Test with a good cache key function, but a get_user function - # that always returns None just to make it convenient to tell - # whether the cache was used (whatever we put in the cache) or - # we got the result from calling the function (None) - - def good_cache_key_function(user_id: int) -> str: - return f"CacheWithKeyDecoratorTest:good_cache_key:{user_id}" - - @get_cache_with_key(good_cache_key_function) - def get_user_function_with_good_cache_keys(user_id: int) -> Any: # nocoverage - return - - hamlet = self.example_user("hamlet") - with self.assertRaises(NotFoundInCache): - get_user_function_with_good_cache_keys(hamlet.id) - - cache_set(good_cache_key_function(hamlet.id), hamlet) - result = get_user_function_with_good_cache_keys(hamlet.id) - self.assertEqual(result, hamlet) - - def test_get_cache_with_bad_key(self) -> None: - def bad_cache_key_function(user_id: int) -> str: - return f"CacheWithKeyDecoratorTest:invalid_character:ą:{user_id}" - - @get_cache_with_key(bad_cache_key_function) - def get_user_function_with_bad_cache_keys(user_id: int) -> Any: # nocoverage - return - - hamlet = self.example_user("hamlet") - with self.assertLogs(level="WARNING") as m: - with self.assertRaises(NotFoundInCache): - get_user_function_with_bad_cache_keys(hamlet.id) - self.assert_length(m.output, 1) - - class SafeCacheFunctionsTest(ZulipTestCase): def test_safe_cache_functions_with_all_good_keys(self) -> None: items = { diff --git a/zerver/tests/test_link_embed.py b/zerver/tests/test_link_embed.py index d4d1ea767f..b02113a0e5 100644 --- a/zerver/tests/test_link_embed.py +++ b/zerver/tests/test_link_embed.py @@ -10,14 +10,14 @@ from django.utils.html import escape from pyoembed.providers import get_provider from requests.exceptions import ConnectionError -from zerver.lib.cache import NotFoundInCache, cache_set, preview_url_cache_key +from zerver.lib.cache import cache_get, preview_url_cache_key from zerver.lib.camo import get_camo_url from zerver.lib.queue import queue_json_publish from zerver.lib.test_classes import ZulipTestCase from zerver.lib.test_helpers import mock_queue_publish from zerver.lib.url_preview.oembed import get_oembed_data, strip_cdata from zerver.lib.url_preview.parsers import GenericParser, OpenGraphParser -from zerver.lib.url_preview.preview import get_link_embed_data, link_embed_data_from_cache +from zerver.lib.url_preview.preview import get_link_embed_data from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData from zerver.models import Message, Realm, UserProfile from zerver.worker.queue_processors import FetchLinksEmbedData @@ -634,18 +634,6 @@ class PreviewTestCase(ZulipTestCase): self.assertIsNone(get_link_embed_data("com.notvalidlink")) self.assertIsNone(get_link_embed_data("μένει.com.notvalidlink")) - def test_link_embed_data_from_cache(self) -> None: - url = "http://test.org/" - link_embed_data = "test data" - - with self.assertRaises(NotFoundInCache): - link_embed_data_from_cache(url) - - with self.settings(CACHES=TEST_CACHES): - key = preview_url_cache_key(url) - cache_set(key, link_embed_data, "database") - self.assertEqual(link_embed_data, link_embed_data_from_cache(url)) - @responses.activate @override_settings(INLINE_URL_EMBED_PREVIEW=True) def test_link_preview_non_html_data(self) -> None: @@ -665,7 +653,7 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: FetchLinksEmbedData().consume(event) - cached_data = link_embed_data_from_cache(url) + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0] self.assertTrue( "INFO:root:Time spent on get_link_embed_data for http://test.org/audio.mp3: " in info_logs.output[0] @@ -699,13 +687,13 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: FetchLinksEmbedData().consume(event) - cached_data = link_embed_data_from_cache(url) + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0] self.assertTrue( "INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: " in info_logs.output[0] ) - assert cached_data + assert cached_data is not None self.assertIsNotNone(cached_data.title) self.assertIsNone(cached_data.image) msg = Message.objects.select_related("sender").get(id=msg_id) @@ -738,13 +726,13 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: FetchLinksEmbedData().consume(event) - cached_data = link_embed_data_from_cache(url) + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0] self.assertTrue( "INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: " in info_logs.output[0] ) - assert cached_data + assert cached_data is not None self.assertIsNotNone(cached_data.title) self.assertIsNone(cached_data.image) msg = Message.objects.select_related("sender").get(id=msg_id) @@ -775,13 +763,13 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: FetchLinksEmbedData().consume(event) - cached_data = link_embed_data_from_cache(url) + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0] self.assertTrue( "INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: " in info_logs.output[0] ) - assert cached_data + assert cached_data is not None self.assertIsNotNone(cached_data.title) self.assertIsNone(cached_data.image) msg = Message.objects.select_related("sender").get(id=msg_id) @@ -808,17 +796,17 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: FetchLinksEmbedData().consume(event) - data = link_embed_data_from_cache(url) + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0] self.assertTrue( "INFO:root:Time spent on get_link_embed_data for http://test.org/: " in info_logs.output[0] ) - assert data is not None + assert cached_data is not None msg = Message.objects.select_related("sender").get(id=msg_id) - self.assertIn(data.title, msg.rendered_content) - assert data.image is not None - self.assertIn(re.sub(r"([^\w-])", r"\\\1", data.image), msg.rendered_content) + self.assertIn(cached_data.title, msg.rendered_content) + assert cached_data.image is not None + self.assertIn(re.sub(r"([^\w-])", r"\\\1", cached_data.image), msg.rendered_content) @responses.activate @override_settings(INLINE_URL_EMBED_PREVIEW=True) @@ -855,8 +843,9 @@ class PreviewTestCase(ZulipTestCase): in info_logs.output[0] ) - with self.assertRaises(NotFoundInCache): - link_embed_data_from_cache(url) + # This did not get cached -- hence the lack of [0] on the cache_get + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory") + self.assertIsNone(cached_data) msg.refresh_from_db() self.assertEqual( @@ -890,9 +879,10 @@ class PreviewTestCase(ZulipTestCase): "INFO:root:Time spent on get_link_embed_data for http://test.org/x: " in info_logs.output[0] ) - cached_data = link_embed_data_from_cache(error_url) - # FIXME: Should we really cache this, especially without cache invalidation? + # FIXME: Should we really cache this, especially without cache invalidation? + cached_data = cache_get(preview_url_cache_key(error_url), cache_name="in-memory")[0] + self.assertIsNone(cached_data) msg.refresh_from_db() self.assertEqual( @@ -931,13 +921,13 @@ class PreviewTestCase(ZulipTestCase): lambda *args, **kwargs: mocked_data, ): FetchLinksEmbedData().consume(event) - data = link_embed_data_from_cache(url) + cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0] self.assertTrue( "INFO:root:Time spent on get_link_embed_data for http://test.org/: " in info_logs.output[0] ) - self.assertEqual(data, mocked_data) + self.assertEqual(cached_data, mocked_data) msg.refresh_from_db() self.assertIn('a data-id="{}"'.format(escape(mocked_data.html)), msg.rendered_content) @@ -966,7 +956,7 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: with mock.patch( - "zerver.lib.markdown.link_preview.link_embed_data_from_cache", + "zerver.worker.queue_processors.url_preview.get_link_embed_data", lambda *args, **kwargs: mocked_data, ): FetchLinksEmbedData().consume(event) @@ -1004,7 +994,7 @@ class PreviewTestCase(ZulipTestCase): with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES): with self.assertLogs(level="INFO") as info_logs: with mock.patch( - "zerver.lib.markdown.link_preview.link_embed_data_from_cache", + "zerver.worker.queue_processors.url_preview.get_link_embed_data", lambda *args, **kwargs: mocked_data, ): FetchLinksEmbedData().consume(event) diff --git a/zerver/tests/test_markdown.py b/zerver/tests/test_markdown.py index 57e954b0a6..0d172d1d2e 100644 --- a/zerver/tests/test_markdown.py +++ b/zerver/tests/test_markdown.py @@ -469,16 +469,13 @@ class MarkdownTest(ZulipTestCase): href = "http://" + url return payload % (f'{url}',) - with mock.patch( - "zerver.lib.url_preview.preview.link_embed_data_from_cache", return_value=None - ): - for inline_url, reference, url in linkify_tests: - try: - match = replaced(reference, url, phrase=inline_url) - except TypeError: - match = reference - converted = markdown_convert_wrapper(inline_url) - self.assertEqual(match, converted) + for inline_url, reference, url in linkify_tests: + try: + match = replaced(reference, url, phrase=inline_url) + except TypeError: + match = reference + converted = markdown_convert_wrapper(inline_url) + self.assertEqual(match, converted) def test_inline_file(self) -> None: msg = "Check out this file file:///Volumes/myserver/Users/Shared/pi.py" diff --git a/zerver/worker/queue_processors.py b/zerver/worker/queue_processors.py index b8e4018138..836851b267 100644 --- a/zerver/worker/queue_processors.py +++ b/zerver/worker/queue_processors.py @@ -86,6 +86,7 @@ from zerver.lib.send_email import ( from zerver.lib.timestamp import timestamp_to_datetime from zerver.lib.upload import handle_reupload_emojis_event from zerver.lib.url_preview import preview as url_preview +from zerver.lib.url_preview.types import UrlEmbedData from zerver.models import ( Message, PreregistrationUser, @@ -856,9 +857,10 @@ class FetchLinksEmbedData(QueueProcessingWorker): CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM = 1 def consume(self, event: Mapping[str, Any]) -> None: + url_embed_data: Dict[str, Optional[UrlEmbedData]] = {} for url in event["urls"]: start_time = time.time() - url_preview.get_link_embed_data(url) + url_embed_data[url] = url_preview.get_link_embed_data(url) logging.info( "Time spent on get_link_embed_data for %s: %s", url, time.time() - start_time ) @@ -879,7 +881,11 @@ class FetchLinksEmbedData(QueueProcessingWorker): # If rendering fails, the called code will raise a JsonableError. rendering_result = render_incoming_message( - message, message.content, message_user_ids, realm + message, + message.content, + message_user_ids, + realm, + url_embed_data=url_embed_data, ) do_update_embedded_data(message.sender, message, message.content, rendering_result)