preview: Use cache only as a non-durable cache, not an IPC.

The `get_link_embed_data` / `link_embed_data_from_cache` pair as
introduced in c93f1d4eda uses the cache
as a temporary store inside of the `embed_links` worker; this means
that it must be durable storage, or the worker will stall and re-fetch
the same links to preview them.

Switch to plumbing through the fetched URL embed data as an parameter
to the Markdown evaluation which uses them, rather than using the
cache as an intermediary.  This frees up the cache to be merely a
non-durable cache.

As a side-effect, this removes get_cache_with_key, and
link_embed_data_from_cache which was its only callsite.

(cherry picked from commit 351bdfaf78)
This commit is contained in:
Alex Vandiver
2022-04-14 12:57:20 -07:00
parent 5ff82c82ae
commit 9a9c6730ff
9 changed files with 76 additions and 146 deletions

View File

@@ -61,6 +61,7 @@ from zerver.lib.streams import access_stream_for_send_message, ensure_stream
from zerver.lib.string_validation import check_stream_name
from zerver.lib.timestamp import timestamp_to_datetime
from zerver.lib.topic import filter_by_exact_message_topic
from zerver.lib.url_preview.types import UrlEmbedData
from zerver.lib.user_message import UserMessageLite, bulk_insert_ums
from zerver.lib.user_mutes import get_muting_users
from zerver.lib.validator import check_widget_content
@@ -124,6 +125,7 @@ def render_incoming_message(
user_ids: Set[int],
realm: Realm,
mention_data: Optional[MentionData] = None,
url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None,
email_gateway: bool = False,
) -> MessageRenderingResult:
realm_alert_words_automaton = get_alert_word_automaton(realm)
@@ -134,6 +136,7 @@ def render_incoming_message(
realm=realm,
realm_alert_words_automaton=realm_alert_words_automaton,
mention_data=mention_data,
url_embed_data=url_embed_data,
email_gateway=email_gateway,
)
except MarkdownRenderingException:

View File

@@ -128,36 +128,6 @@ def get_cache_backend(cache_name: Optional[str]) -> BaseCache:
return caches[cache_name]
def get_cache_with_key(
keyfunc: Callable[..., str],
cache_name: Optional[str] = None,
) -> Callable[[FuncT], FuncT]:
"""
The main goal of this function getting value from the cache like in the "cache_with_key".
A cache value can contain any data including the "None", so
here used exception for case if value isn't found in the cache.
"""
def decorator(func: FuncT) -> FuncT:
@wraps(func)
def func_with_caching(*args: object, **kwargs: object) -> object:
key = keyfunc(*args, **kwargs)
try:
val = cache_get(key, cache_name=cache_name)
except InvalidCacheKeyException:
stack_trace = traceback.format_exc()
log_invalid_cache_keys(stack_trace, [key])
val = None
if val is not None:
return val[0]
raise NotFoundInCache()
return cast(FuncT, func_with_caching) # https://github.com/python/mypy/issues/1927
return decorator
def cache_with_key(
keyfunc: Callable[..., str],
cache_name: Optional[str] = None,

View File

@@ -48,7 +48,7 @@ from tlds import tld_set
from typing_extensions import TypedDict
from zerver.lib import mention as mention
from zerver.lib.cache import NotFoundInCache, cache_with_key
from zerver.lib.cache import cache_with_key
from zerver.lib.camo import get_camo_url
from zerver.lib.emoji import EMOTICON_RE, codepoint_to_name, name_to_codepoint, translate_emoticons
from zerver.lib.exceptions import MarkdownRenderingException
@@ -63,7 +63,6 @@ from zerver.lib.timeout import TimeoutExpired, timeout
from zerver.lib.timezone import common_timezones
from zerver.lib.types import LinkifierDict
from zerver.lib.url_encoding import encode_stream, hash_util_encode
from zerver.lib.url_preview import preview as link_preview
from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData
from zerver.models import EmojiInfo, Message, Realm, linkifiers_for_realm
@@ -1333,29 +1332,32 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
if not self.md.url_embed_preview_enabled:
continue
try:
extracted_data = link_preview.link_embed_data_from_cache(url)
except NotFoundInCache:
if self.md.url_embed_data is None or url not in self.md.url_embed_data:
self.md.zulip_rendering_result.links_for_preview.add(url)
continue
if extracted_data:
if youtube is not None:
title = self.youtube_title(extracted_data)
if title is not None:
if url == text:
found_url.family.child.text = title
else:
found_url.family.child.text = text
continue
self.add_embed(root, url, extracted_data)
if self.vimeo_id(url):
title = self.vimeo_title(extracted_data)
if title:
if url == text:
found_url.family.child.text = title
else:
found_url.family.child.text = text
# Existing but being None means that we did process the
# URL, but it was not valid to preview.
extracted_data = self.md.url_embed_data[url]
if extracted_data is None:
continue
if youtube is not None:
title = self.youtube_title(extracted_data)
if title is not None:
if url == text:
found_url.family.child.text = title
else:
found_url.family.child.text = text
continue
self.add_embed(root, url, extracted_data)
if self.vimeo_id(url):
title = self.vimeo_title(extracted_data)
if title:
if url == text:
found_url.family.child.text = title
else:
found_url.family.child.text = text
class CompiledInlineProcessor(markdown.inlinepatterns.InlineProcessor):
@@ -2110,6 +2112,7 @@ class Markdown(markdown.Markdown):
zulip_rendering_result: Optional[MessageRenderingResult]
image_preview_enabled: bool
url_embed_preview_enabled: bool
url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]]
def __init__(
self,
@@ -2448,6 +2451,7 @@ def do_convert(
message_realm: Optional[Realm] = None,
sent_by_bot: bool = False,
translate_emoticons: bool = False,
url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None,
mention_data: Optional[MentionData] = None,
email_gateway: bool = False,
no_previews: bool = False,
@@ -2503,6 +2507,7 @@ def do_convert(
_md_engine.url_embed_preview_enabled = url_embed_preview_enabled(
message, message_realm, no_previews
)
_md_engine.url_embed_data = url_embed_data
# Pre-fetch data from the DB that is used in the Markdown thread
if message_realm is not None:
@@ -2606,6 +2611,7 @@ def markdown_convert(
message_realm: Optional[Realm] = None,
sent_by_bot: bool = False,
translate_emoticons: bool = False,
url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None,
mention_data: Optional[MentionData] = None,
email_gateway: bool = False,
no_previews: bool = False,
@@ -2618,6 +2624,7 @@ def markdown_convert(
message_realm,
sent_by_bot,
translate_emoticons,
url_embed_data,
mention_data,
email_gateway,
no_previews=no_previews,

View File

@@ -39,6 +39,7 @@ from zerver.lib.streams import get_web_public_streams_queryset
from zerver.lib.timestamp import datetime_to_timestamp
from zerver.lib.topic import DB_TOPIC_NAME, MESSAGE__TOPIC, TOPIC_LINKS, TOPIC_NAME
from zerver.lib.types import DisplayRecipientT, EditHistoryEvent, UserDisplayRecipient
from zerver.lib.url_preview.types import UrlEmbedData
from zerver.lib.user_topics import build_topic_mute_checker, topic_is_muted
from zerver.models import (
MAX_TOPIC_NAME_LENGTH,
@@ -899,6 +900,7 @@ def render_markdown(
content: str,
realm: Optional[Realm] = None,
realm_alert_words_automaton: Optional[ahocorasick.Automaton] = None,
url_embed_data: Optional[Dict[str, Optional[UrlEmbedData]]] = None,
mention_data: Optional[MentionData] = None,
email_gateway: bool = False,
) -> MessageRenderingResult:
@@ -920,6 +922,7 @@ def render_markdown(
message_realm=realm,
sent_by_bot=sent_by_bot,
translate_emoticons=translate_emoticons,
url_embed_data=url_embed_data,
mention_data=mention_data,
email_gateway=email_gateway,
)

View File

@@ -8,7 +8,7 @@ from django.conf import settings
from django.utils.encoding import smart_str
from version import ZULIP_VERSION
from zerver.lib.cache import cache_with_key, get_cache_with_key, preview_url_cache_key
from zerver.lib.cache import cache_with_key, preview_url_cache_key
from zerver.lib.outgoing_http import OutgoingSession
from zerver.lib.pysa import mark_sanitized
from zerver.lib.url_preview.oembed import get_oembed_data
@@ -117,10 +117,3 @@ def get_link_embed_data(
if data.image:
data.image = urljoin(response.url, data.image)
return data
@get_cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME)
def link_embed_data_from_cache(
url: str, maxwidth: int = 640, maxheight: int = 480
) -> Optional[UrlEmbedData]:
return None

View File

@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional
from unittest.mock import Mock, patch
from django.conf import settings
@@ -7,7 +7,6 @@ from zerver.apps import flush_cache
from zerver.lib.cache import (
MEMCACHED_MAX_KEY_LENGTH,
InvalidCacheKeyException,
NotFoundInCache,
bulk_cached_fetch,
cache_delete,
cache_delete_many,
@@ -16,7 +15,6 @@ from zerver.lib.cache import (
cache_set,
cache_set_many,
cache_with_key,
get_cache_with_key,
safe_cache_get_many,
safe_cache_set_many,
user_profile_by_id_cache_key,
@@ -166,43 +164,6 @@ class CacheWithKeyDecoratorTest(ZulipTestCase):
self.assert_length(queries, 0)
class GetCacheWithKeyDecoratorTest(ZulipTestCase):
def test_get_cache_with_good_key(self) -> None:
# Test with a good cache key function, but a get_user function
# that always returns None just to make it convenient to tell
# whether the cache was used (whatever we put in the cache) or
# we got the result from calling the function (None)
def good_cache_key_function(user_id: int) -> str:
return f"CacheWithKeyDecoratorTest:good_cache_key:{user_id}"
@get_cache_with_key(good_cache_key_function)
def get_user_function_with_good_cache_keys(user_id: int) -> Any: # nocoverage
return
hamlet = self.example_user("hamlet")
with self.assertRaises(NotFoundInCache):
get_user_function_with_good_cache_keys(hamlet.id)
cache_set(good_cache_key_function(hamlet.id), hamlet)
result = get_user_function_with_good_cache_keys(hamlet.id)
self.assertEqual(result, hamlet)
def test_get_cache_with_bad_key(self) -> None:
def bad_cache_key_function(user_id: int) -> str:
return f"CacheWithKeyDecoratorTest:invalid_character:ą:{user_id}"
@get_cache_with_key(bad_cache_key_function)
def get_user_function_with_bad_cache_keys(user_id: int) -> Any: # nocoverage
return
hamlet = self.example_user("hamlet")
with self.assertLogs(level="WARNING") as m:
with self.assertRaises(NotFoundInCache):
get_user_function_with_bad_cache_keys(hamlet.id)
self.assert_length(m.output, 1)
class SafeCacheFunctionsTest(ZulipTestCase):
def test_safe_cache_functions_with_all_good_keys(self) -> None:
items = {

View File

@@ -10,14 +10,14 @@ from django.utils.html import escape
from pyoembed.providers import get_provider
from requests.exceptions import ConnectionError
from zerver.lib.cache import NotFoundInCache, cache_set, preview_url_cache_key
from zerver.lib.cache import cache_get, preview_url_cache_key
from zerver.lib.camo import get_camo_url
from zerver.lib.queue import queue_json_publish
from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.test_helpers import mock_queue_publish
from zerver.lib.url_preview.oembed import get_oembed_data, strip_cdata
from zerver.lib.url_preview.parsers import GenericParser, OpenGraphParser
from zerver.lib.url_preview.preview import get_link_embed_data, link_embed_data_from_cache
from zerver.lib.url_preview.preview import get_link_embed_data
from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData
from zerver.models import Message, Realm, UserProfile
from zerver.worker.queue_processors import FetchLinksEmbedData
@@ -634,18 +634,6 @@ class PreviewTestCase(ZulipTestCase):
self.assertIsNone(get_link_embed_data("com.notvalidlink"))
self.assertIsNone(get_link_embed_data("μένει.com.notvalidlink"))
def test_link_embed_data_from_cache(self) -> None:
url = "http://test.org/"
link_embed_data = "test data"
with self.assertRaises(NotFoundInCache):
link_embed_data_from_cache(url)
with self.settings(CACHES=TEST_CACHES):
key = preview_url_cache_key(url)
cache_set(key, link_embed_data, "database")
self.assertEqual(link_embed_data, link_embed_data_from_cache(url))
@responses.activate
@override_settings(INLINE_URL_EMBED_PREVIEW=True)
def test_link_preview_non_html_data(self) -> None:
@@ -665,7 +653,7 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
FetchLinksEmbedData().consume(event)
cached_data = link_embed_data_from_cache(url)
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0]
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/audio.mp3: "
in info_logs.output[0]
@@ -699,13 +687,13 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
FetchLinksEmbedData().consume(event)
cached_data = link_embed_data_from_cache(url)
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0]
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: "
in info_logs.output[0]
)
assert cached_data
assert cached_data is not None
self.assertIsNotNone(cached_data.title)
self.assertIsNone(cached_data.image)
msg = Message.objects.select_related("sender").get(id=msg_id)
@@ -738,13 +726,13 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
FetchLinksEmbedData().consume(event)
cached_data = link_embed_data_from_cache(url)
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0]
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: "
in info_logs.output[0]
)
assert cached_data
assert cached_data is not None
self.assertIsNotNone(cached_data.title)
self.assertIsNone(cached_data.image)
msg = Message.objects.select_related("sender").get(id=msg_id)
@@ -775,13 +763,13 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
FetchLinksEmbedData().consume(event)
cached_data = link_embed_data_from_cache(url)
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0]
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/foo.html: "
in info_logs.output[0]
)
assert cached_data
assert cached_data is not None
self.assertIsNotNone(cached_data.title)
self.assertIsNone(cached_data.image)
msg = Message.objects.select_related("sender").get(id=msg_id)
@@ -808,17 +796,17 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
FetchLinksEmbedData().consume(event)
data = link_embed_data_from_cache(url)
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0]
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/: "
in info_logs.output[0]
)
assert data is not None
assert cached_data is not None
msg = Message.objects.select_related("sender").get(id=msg_id)
self.assertIn(data.title, msg.rendered_content)
assert data.image is not None
self.assertIn(re.sub(r"([^\w-])", r"\\\1", data.image), msg.rendered_content)
self.assertIn(cached_data.title, msg.rendered_content)
assert cached_data.image is not None
self.assertIn(re.sub(r"([^\w-])", r"\\\1", cached_data.image), msg.rendered_content)
@responses.activate
@override_settings(INLINE_URL_EMBED_PREVIEW=True)
@@ -855,8 +843,9 @@ class PreviewTestCase(ZulipTestCase):
in info_logs.output[0]
)
with self.assertRaises(NotFoundInCache):
link_embed_data_from_cache(url)
# This did not get cached -- hence the lack of [0] on the cache_get
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")
self.assertIsNone(cached_data)
msg.refresh_from_db()
self.assertEqual(
@@ -890,9 +879,10 @@ class PreviewTestCase(ZulipTestCase):
"INFO:root:Time spent on get_link_embed_data for http://test.org/x: "
in info_logs.output[0]
)
cached_data = link_embed_data_from_cache(error_url)
# FIXME: Should we really cache this, especially without cache invalidation?
# FIXME: Should we really cache this, especially without cache invalidation?
cached_data = cache_get(preview_url_cache_key(error_url), cache_name="in-memory")[0]
self.assertIsNone(cached_data)
msg.refresh_from_db()
self.assertEqual(
@@ -931,13 +921,13 @@ class PreviewTestCase(ZulipTestCase):
lambda *args, **kwargs: mocked_data,
):
FetchLinksEmbedData().consume(event)
data = link_embed_data_from_cache(url)
cached_data = cache_get(preview_url_cache_key(url), cache_name="in-memory")[0]
self.assertTrue(
"INFO:root:Time spent on get_link_embed_data for http://test.org/: "
in info_logs.output[0]
)
self.assertEqual(data, mocked_data)
self.assertEqual(cached_data, mocked_data)
msg.refresh_from_db()
self.assertIn('a data-id="{}"'.format(escape(mocked_data.html)), msg.rendered_content)
@@ -966,7 +956,7 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
with mock.patch(
"zerver.lib.markdown.link_preview.link_embed_data_from_cache",
"zerver.worker.queue_processors.url_preview.get_link_embed_data",
lambda *args, **kwargs: mocked_data,
):
FetchLinksEmbedData().consume(event)
@@ -1004,7 +994,7 @@ class PreviewTestCase(ZulipTestCase):
with self.settings(TEST_SUITE=False, CACHES=TEST_CACHES):
with self.assertLogs(level="INFO") as info_logs:
with mock.patch(
"zerver.lib.markdown.link_preview.link_embed_data_from_cache",
"zerver.worker.queue_processors.url_preview.get_link_embed_data",
lambda *args, **kwargs: mocked_data,
):
FetchLinksEmbedData().consume(event)

View File

@@ -469,16 +469,13 @@ class MarkdownTest(ZulipTestCase):
href = "http://" + url
return payload % (f'<a href="{href}">{url}</a>',)
with mock.patch(
"zerver.lib.url_preview.preview.link_embed_data_from_cache", return_value=None
):
for inline_url, reference, url in linkify_tests:
try:
match = replaced(reference, url, phrase=inline_url)
except TypeError:
match = reference
converted = markdown_convert_wrapper(inline_url)
self.assertEqual(match, converted)
for inline_url, reference, url in linkify_tests:
try:
match = replaced(reference, url, phrase=inline_url)
except TypeError:
match = reference
converted = markdown_convert_wrapper(inline_url)
self.assertEqual(match, converted)
def test_inline_file(self) -> None:
msg = "Check out this file file:///Volumes/myserver/Users/Shared/pi.py"

View File

@@ -86,6 +86,7 @@ from zerver.lib.send_email import (
from zerver.lib.timestamp import timestamp_to_datetime
from zerver.lib.upload import handle_reupload_emojis_event
from zerver.lib.url_preview import preview as url_preview
from zerver.lib.url_preview.types import UrlEmbedData
from zerver.models import (
Message,
PreregistrationUser,
@@ -856,9 +857,10 @@ class FetchLinksEmbedData(QueueProcessingWorker):
CONSUME_ITERATIONS_BEFORE_UPDATE_STATS_NUM = 1
def consume(self, event: Mapping[str, Any]) -> None:
url_embed_data: Dict[str, Optional[UrlEmbedData]] = {}
for url in event["urls"]:
start_time = time.time()
url_preview.get_link_embed_data(url)
url_embed_data[url] = url_preview.get_link_embed_data(url)
logging.info(
"Time spent on get_link_embed_data for %s: %s", url, time.time() - start_time
)
@@ -879,7 +881,11 @@ class FetchLinksEmbedData(QueueProcessingWorker):
# If rendering fails, the called code will raise a JsonableError.
rendering_result = render_incoming_message(
message, message.content, message_user_ids, realm
message,
message.content,
message_user_ids,
realm,
url_embed_data=url_embed_data,
)
do_update_embedded_data(message.sender, message, message.content, rendering_result)