mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-04 05:53:43 +00:00 
			
		
		
		
	markdown: Rewrite YouTube URL parser without regex spaghetti.
This also adds support for the new YouTube Shorts URLs.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
		
				
					committed by
					
						
						Tim Abbott
					
				
			
			
				
	
			
			
			
						parent
						
							53aa3f6c71
						
					
				
				
					commit
					0a1904a6a7
				
			@@ -26,7 +26,7 @@ from typing import (
 | 
				
			|||||||
    TypeVar,
 | 
					    TypeVar,
 | 
				
			||||||
    Union,
 | 
					    Union,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from urllib.parse import urlencode, urljoin, urlsplit
 | 
					from urllib.parse import parse_qs, urlencode, urljoin, urlsplit
 | 
				
			||||||
from xml.etree.ElementTree import Element, SubElement
 | 
					from xml.etree.ElementTree import Element, SubElement
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import ahocorasick
 | 
					import ahocorasick
 | 
				
			||||||
@@ -811,28 +811,30 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
 | 
				
			|||||||
    def youtube_id(self, url: str) -> Optional[str]:
 | 
					    def youtube_id(self, url: str) -> Optional[str]:
 | 
				
			||||||
        if not self.zmd.image_preview_enabled:
 | 
					        if not self.zmd.image_preview_enabled:
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
        # YouTube video id extraction regular expression from https://pastebin.com/KyKAFv1s
 | 
					
 | 
				
			||||||
        # Slightly modified to support URLs of the forms
 | 
					        id = None
 | 
				
			||||||
        #   - youtu.be/<id>
 | 
					        split_url = urlsplit(url)
 | 
				
			||||||
        #   - youtube.com/playlist?v=<id>&list=<list-id>
 | 
					        if split_url.scheme in ("http", "https"):
 | 
				
			||||||
        #   - youtube.com/watch_videos?video_ids=<id1>,<id2>,<id3>
 | 
					            if split_url.hostname in (
 | 
				
			||||||
        # If it matches, match.group(2) is the video id.
 | 
					                "m.youtube.com",
 | 
				
			||||||
        schema_re = r"(?:https?://)"
 | 
					                "www.youtube.com",
 | 
				
			||||||
        host_re = r"(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)"
 | 
					                "www.youtube-nocookie.com",
 | 
				
			||||||
        param_re = (
 | 
					                "youtube.com",
 | 
				
			||||||
            r"(?:(?:(?:v|embed)/)"
 | 
					                "youtube-nocookie.com",
 | 
				
			||||||
            r"|(?:(?:(?:watch|playlist)(?:_popup|_videos)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v(?:ideo_ids)?=))"
 | 
					            ):
 | 
				
			||||||
        )
 | 
					                query = parse_qs(split_url.query)
 | 
				
			||||||
        id_re = r"([0-9A-Za-z_-]+)"
 | 
					                if split_url.path in ("/watch", "/watch_popup") and "v" in query:
 | 
				
			||||||
        youtube_re = r"^({schema_re}?{host_re}{param_re}?)?{id_re}(?(1).+)?$"
 | 
					                    id = query["v"][0]
 | 
				
			||||||
        youtube_re = youtube_re.format(
 | 
					                elif split_url.path == "/watch_videos" and "video_ids" in query:
 | 
				
			||||||
            schema_re=schema_re, host_re=host_re, id_re=id_re, param_re=param_re
 | 
					                    id = query["video_ids"][0].split(",", 1)[0]
 | 
				
			||||||
        )
 | 
					                elif split_url.path.startswith(("/embed/", "/shorts/", "/v/")):
 | 
				
			||||||
        match = re.match(youtube_re, url)
 | 
					                    id = split_url.path.split("/", 3)[2]
 | 
				
			||||||
        # URLs of the form youtube.com/playlist?list=<list-id> are incorrectly matched
 | 
					            elif split_url.hostname == "youtu.be" and split_url.path.startswith("/"):
 | 
				
			||||||
        if match is None or match.group(2) == "playlist":
 | 
					                id = split_url.path[len("/") :]
 | 
				
			||||||
            return None
 | 
					
 | 
				
			||||||
        return match.group(2)
 | 
					        if id is not None and re.fullmatch(r"[0-9A-Za-z_-]+", id):
 | 
				
			||||||
 | 
					            return id
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def youtube_title(self, extracted_data: UrlEmbedData) -> Optional[str]:
 | 
					    def youtube_title(self, extracted_data: UrlEmbedData) -> Optional[str]:
 | 
				
			||||||
        if extracted_data.title is not None:
 | 
					        if extracted_data.title is not None:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -537,14 +537,12 @@ class MarkdownTest(ZulipTestCase):
 | 
				
			|||||||
            '<p><a href="https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>',
 | 
					            '<p><a href="https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/playlist?list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>',
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        msg = (
 | 
					        msg = "https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"
 | 
				
			||||||
            "https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        converted = markdown_convert_wrapper(msg)
 | 
					        converted = markdown_convert_wrapper(msg)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(
 | 
					        self.assertEqual(
 | 
				
			||||||
            converted,
 | 
					            converted,
 | 
				
			||||||
            f"""<p><a href="https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>\n<div class="youtube-video message_inline_image"><a data-id="O5nskjZ_GoI" href="https://www.youtube.com/playlist?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"><img src="{get_camo_url("https://i.ytimg.com/vi/O5nskjZ_GoI/default.jpg")}"></a></div>""",
 | 
					            f"""<p><a href="https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo">https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo</a></p>\n<div class="youtube-video message_inline_image"><a data-id="O5nskjZ_GoI" href="https://www.youtube.com/watch?v=O5nskjZ_GoI&list=PL8dPuuaLjXtNlUrzyH5r6jN9ulIgZBpdo"><img src="{get_camo_url("https://i.ytimg.com/vi/O5nskjZ_GoI/default.jpg")}"></a></div>""",
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        msg = "http://www.youtube.com/watch_videos?video_ids=nOJgD4fcZhI,i96UO8-GFvw"
 | 
					        msg = "http://www.youtube.com/watch_videos?video_ids=nOJgD4fcZhI,i96UO8-GFvw"
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user