diff --git a/zerver/lib/markdown/__init__.py b/zerver/lib/markdown/__init__.py index 65052dc002..44393e566f 100644 --- a/zerver/lib/markdown/__init__.py +++ b/zerver/lib/markdown/__init__.py @@ -26,7 +26,7 @@ from typing import ( TypeVar, Union, ) -from urllib.parse import urlencode, urljoin, urlsplit +from urllib.parse import parse_qs, urlencode, urljoin, urlsplit from xml.etree.ElementTree import Element, SubElement import ahocorasick @@ -811,28 +811,30 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor): def youtube_id(self, url: str) -> Optional[str]: if not self.zmd.image_preview_enabled: return None - # YouTube video id extraction regular expression from https://pastebin.com/KyKAFv1s - # Slightly modified to support URLs of the forms - # - youtu.be/ - # - youtube.com/playlist?v=&list= - # - youtube.com/watch_videos?video_ids=,, - # If it matches, match.group(2) is the video id. - schema_re = r"(?:https?://)" - host_re = r"(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)" - param_re = ( - r"(?:(?:(?:v|embed)/)" - r"|(?:(?:(?:watch|playlist)(?:_popup|_videos)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v(?:ideo_ids)?=))" - ) - id_re = r"([0-9A-Za-z_-]+)" - youtube_re = r"^({schema_re}?{host_re}{param_re}?)?{id_re}(?(1).+)?$" - youtube_re = youtube_re.format( - schema_re=schema_re, host_re=host_re, id_re=id_re, param_re=param_re - ) - match = re.match(youtube_re, url) - # URLs of the form youtube.com/playlist?list= are incorrectly matched - if match is None or match.group(2) == "playlist": - return None - return match.group(2) + + id = None + split_url = urlsplit(url) + if split_url.scheme in ("http", "https"): + if split_url.hostname in ( + "m.youtube.com", + "www.youtube.com", + "www.youtube-nocookie.com", + "youtube.com", + "youtube-nocookie.com", + ): + query = parse_qs(split_url.query) + if split_url.path in ("/watch", "/watch_popup") and "v" in query: + id = query["v"][0] + elif split_url.path == "/watch_videos" and "video_ids" in query: + id = query["video_ids"][0].split(",", 1)[0] + elif split_url.path.startswith(("/embed/", "/shorts/", "/v/")): + id = split_url.path.split("/", 3)[2] + elif split_url.hostname == "youtu.be" and split_url.path.startswith("/"): + id = split_url.path[len("/") :] + + if id is not None and re.fullmatch(r"[0-9A-Za-z_-]+", id): + return id + return None def youtube_title(self, extracted_data: UrlEmbedData) -> Optional[str]: if extracted_data.title is not None: diff --git a/zerver/tests/test_markdown.py b/zerver/tests/test_markdown.py index 78a4af56cb..5f5ce41c94 100644 --- a/zerver/tests/test_markdown.py +++ b/zerver/tests/test_markdown.py @@ -537,14 +537,12 @@ class MarkdownTest(ZulipTestCase): '

https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo

', ) - msg = ( - "https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo" - ) + msg = "https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo" converted = markdown_convert_wrapper(msg) self.assertEqual( converted, - f"""

https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo

\n
""", + f"""

https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo

\n
""", ) msg = "http://www.youtube.com/watch_videos?video_ids=nOJgD4fcZhI,i96UO8-GFvw"