# Mirror of https://github.com/zulip/zulip.git
# (synced 2025-11-04 05:53:43 +00:00).
# File info: 62 lines, 2.1 KiB, Python.
from urllib.parse import urlsplit

from bs4.element import Tag
from typing_extensions import override

from zerver.lib.url_preview.parsers.base import BaseParser
from zerver.lib.url_preview.types import UrlEmbedData


class GenericParser(BaseParser):
    """Build URL embed data from ordinary HTML elements.

    The title, description and image are read from ``self._soup`` using
    the ``<title>``, ``<h1>``, ``<meta name="description">``, ``<p>`` and
    ``<img>`` tags.  ``self._soup`` is presumably the parsed document set
    up by ``BaseParser`` -- confirm against that class.
    """

    @override
    def extract_data(self) -> UrlEmbedData:
        """Return the title, description and image found in the page.

        Each field is ``None`` when the corresponding lookup finds
        nothing usable.
        """
        return UrlEmbedData(
            title=self._get_title(),
            description=self._get_description(),
            image=self._get_image(),
        )

    def _get_title(self) -> str | None:
        """Return the ``<title>`` text, else the first ``<h1>`` text, else ``None``.

        An element whose text is the empty string is treated as missing.
        """
        soup = self._soup
        title_tag = soup.title
        if title_tag and title_tag.text != "":
            return title_tag.text
        heading = soup.h1
        if heading and heading.text != "":
            return heading.text
        return None

    def _get_description(self) -> str | None:
        """Return a description for the page, or ``None`` if none is found.

        Candidates are tried in this order, skipping empty values:

        1. the ``content`` attribute of ``<meta name="description">``;
        2. the text of the first ``<p>`` found after the first ``<h1>``;
        3. the text of the first ``<p>`` in the document.
        """
        soup = self._soup
        meta = soup.find("meta", attrs={"name": "description"})
        if isinstance(meta, Tag) and meta.get("content", "") != "":
            content = meta["content"]
            # Narrows the attribute value to ``str`` for the type checker.
            assert isinstance(content, str)
            return content

        first_h1 = soup.find("h1")
        if first_h1:
            paragraph = first_h1.find_next("p")
            if paragraph and paragraph.text != "":
                return paragraph.text

        paragraph = soup.find("p")
        if paragraph and paragraph.text != "":
            return paragraph.text
        return None

    def _get_image(self) -> str | None:
        """Return the ``src`` of the first ``<img>`` sibling following the first ``<h1>``.

        Presumably that image is the main image of the page.  Returns
        ``None`` when there is no ``<h1>``, no following sibling ``<img>``
        with a non-empty ``src``, or when ``urlsplit`` rejects the value.
        """
        soup = self._soup
        first_h1 = soup.find("h1")
        if not first_h1:
            return None

        image = first_h1.find_next_sibling("img", src=True)
        if not isinstance(image, Tag) or image["src"] == "":
            return None

        src = image["src"]
        # Narrows the attribute value to ``str`` for the type checker.
        assert isinstance(src, str)
        try:
            # We use urlsplit and not URLValidator because we
            # need to support relative URLs.
            urlsplit(src)
        except ValueError:
            return None
        return src
 |