mirror of
https://github.com/zulip/zulip.git
synced 2025-11-09 00:18:12 +00:00
Fixes #2665. Regenerated by tabbott with `lint --fix` after a rebase and change in parameters. Note from tabbott: In a few cases, this converts technical debt in the form of unsorted imports into different technical debt in the form of our largest files having very long, ugly import sequences at the start. I expect this change will increase pressure for us to split those files, which isn't a bad thing. Signed-off-by: Anders Kaseorg <anders@zulip.com>
48 lines
1.6 KiB
Python
48 lines
1.6 KiB
Python
from typing import Dict, Optional
|
|
|
|
from zerver.lib.url_preview.parsers.base import BaseParser
|
|
|
|
|
|
class GenericParser(BaseParser):
    """URL-preview parser that relies only on generic HTML elements.

    Reads the parsed document from ``self._soup`` (presumably a
    BeautifulSoup tree set up by ``BaseParser`` -- confirm there) and
    derives a title, description and image from ``<title>``, ``<h1>``,
    ``<meta name="description">``, ``<p>`` and ``<img>`` elements.
    """

    def extract_data(self) -> Dict[str, Optional[str]]:
        """Return the 'title', 'description' and 'image' preview fields.

        Each value is None when the corresponding lookup finds nothing.
        """
        return {
            'title': self._get_title(),
            'description': self._get_description(),
            'image': self._get_image(),
        }

    def _get_title(self) -> Optional[str]:
        """Return the <title> text, else the first <h1> text, else None."""
        soup = self._soup
        if soup.title and soup.title.text != '':
            return soup.title.text
        if soup.h1 and soup.h1.text != '':
            return soup.h1.text
        return None

    def _get_description(self) -> Optional[str]:
        """Return a description for the page, or None.

        Candidates are tried in order:
        1. the ``content`` of ``<meta name="description">``;
        2. the first ``<p>`` that follows the first ``<h1>``;
        3. the first ``<p>`` in the document.
        A candidate is skipped when it is missing or its text is empty.
        """
        soup = self._soup
        meta_description = soup.find('meta', attrs={'name': 'description'})
        if meta_description and meta_description.get('content', '') != '':
            return meta_description['content']

        first_h1 = soup.find('h1')
        if first_h1:
            first_p = first_h1.find_next('p')
            if first_p and first_p.text != '':
                return first_p.text

        first_p = soup.find('p')
        if first_p and first_p.text != '':
            return first_p.text
        return None

    def _get_image(self) -> Optional[str]:
        """
        Finding a first image after the h1 header.
        Presumably it will be the main image.

        Returns the image's ``src``, or None when there is no <h1>, no
        following <img> sibling, or the image has a missing/empty ``src``.
        """
        first_h1 = self._soup.find('h1')
        if not first_h1:
            return None

        first_image = first_h1.find_next_sibling('img')
        if not first_image:
            return None

        # Use .get() rather than ['src']: an <img> without a src attribute
        # would otherwise raise KeyError instead of yielding "no image".
        src = first_image.get('src')
        if src:
            return src
        return None
|