python: Normalize quotes with Black.
Signed-off-by: Anders Kaseorg <anders@zulip.com>
commit 6e4c3e41dc
parent 11741543da
committed by Tim Abbott
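For context: Black normalizes string quotes by default (this can be disabled with its --skip-string-normalization flag), rewriting single-quoted literals to double quotes wherever that adds no escaping. That purely mechanical rewrite is all this diff contains. A minimal before/after sketch of the effect, using two lines from the diff itself:

# Before running Black: single-quoted literals.
BOT_NAME = 'documentation_crawler'
LOG_LEVEL = 'WARNING'

# After `black .`: the same literals, normalized to double quotes.
BOT_NAME = "documentation_crawler"
LOG_LEVEL = "WARNING"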
@@ -7,20 +7,20 @@
 # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'documentation_crawler'
+BOT_NAME = "documentation_crawler"
 
-SPIDER_MODULES = ['documentation_crawler.spiders']
-NEWSPIDER_MODULE = 'documentation_crawler.spiders'
-COMMANDS_MODULE = 'documentation_crawler.commands'
-LOG_LEVEL = 'WARNING'
+SPIDER_MODULES = ["documentation_crawler.spiders"]
+NEWSPIDER_MODULE = "documentation_crawler.spiders"
+COMMANDS_MODULE = "documentation_crawler.commands"
+LOG_LEVEL = "WARNING"
 DOWNLOAD_TIMEOUT = 15
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 USER_AGENT = (
-    'Mozilla/5.0 (X11; Linux x86_64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/54.0.2840.59 Safari/537.36'
+    "Mozilla/5.0 (X11; Linux x86_64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/54.0.2840.59 Safari/537.36"
 )
 
 # Obey robots.txt rules
@@ -18,6 +18,6 @@ def get_start_url() -> List[str]:
 
 class DocumentationSpider(BaseDocumentationSpider):
     name = "documentation_crawler"
-    deny_domains = ['localhost:9991']
-    deny = [r'\_sources\/.*\.txt']
+    deny_domains = ["localhost:9991"]
+    deny = [r"\_sources\/.*\.txt"]
     start_urls = get_start_url()
@@ -22,8 +22,8 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
         self.images_static_dir: str = get_images_dir(self.images_path)
 
     def _is_external_url(self, url: str) -> bool:
-        is_external = url.startswith('http') and self.start_urls[0] not in url
-        if self._has_extension(url) and f'localhost:9981/{self.images_path}' in url:
+        is_external = url.startswith("http") and self.start_urls[0] not in url
+        if self._has_extension(url) and f"localhost:9981/{self.images_path}" in url:
             self.static_images.add(basename(urlparse(url).path))
         return is_external or self._has_extension(url)
 
@@ -37,20 +37,20 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
         unused_images_relatedpath = [
             os.path.join(self.images_path, img) for img in unused_images
         ]
-        raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
+        raise Exception(exception_message.format(", ".join(unused_images_relatedpath)))
 
 
 class HelpDocumentationSpider(UnusedImagesLinterSpider):
     name = "help_documentation_crawler"
-    start_urls = ['http://localhost:9981/help']
+    start_urls = ["http://localhost:9981/help"]
     deny_domains: List[str] = []
-    deny = ['/privacy']
+    deny = ["/privacy"]
     images_path = "static/images/help"
 
 
 class APIDocumentationSpider(UnusedImagesLinterSpider):
-    name = 'api_documentation_crawler'
-    start_urls = ['http://localhost:9981/api']
+    name = "api_documentation_crawler"
+    start_urls = ["http://localhost:9981/api"]
     deny_domains: List[str] = []
     images_path = "static/images/api"
 
@@ -58,28 +58,28 @@ class APIDocumentationSpider(UnusedImagesLinterSpider):
 class PorticoDocumentationSpider(BaseDocumentationSpider):
     def _is_external_url(self, url: str) -> bool:
         return (
-            not url.startswith('http://localhost:9981')
-            or url.startswith('http://localhost:9981/help')
-            or url.startswith('http://localhost:9981/api')
+            not url.startswith("http://localhost:9981")
+            or url.startswith("http://localhost:9981/help")
+            or url.startswith("http://localhost:9981/api")
             or self._has_extension(url)
         )
 
-    name = 'portico_documentation_crawler'
+    name = "portico_documentation_crawler"
     start_urls = [
-        'http://localhost:9981/hello',
-        'http://localhost:9981/history',
-        'http://localhost:9981/plans',
-        'http://localhost:9981/team',
-        'http://localhost:9981/apps',
-        'http://localhost:9981/integrations',
-        'http://localhost:9981/terms',
-        'http://localhost:9981/privacy',
-        'http://localhost:9981/features',
-        'http://localhost:9981/why-zulip',
-        'http://localhost:9981/for/open-source',
-        'http://localhost:9981/for/companies',
-        'http://localhost:9981/for/working-groups-and-communities',
-        'http://localhost:9981/for/research',
-        'http://localhost:9981/security',
+        "http://localhost:9981/hello",
+        "http://localhost:9981/history",
+        "http://localhost:9981/plans",
+        "http://localhost:9981/team",
+        "http://localhost:9981/apps",
+        "http://localhost:9981/integrations",
+        "http://localhost:9981/terms",
+        "http://localhost:9981/privacy",
+        "http://localhost:9981/features",
+        "http://localhost:9981/why-zulip",
+        "http://localhost:9981/for/open-source",
+        "http://localhost:9981/for/companies",
+        "http://localhost:9981/for/working-groups-and-communities",
+        "http://localhost:9981/for/research",
+        "http://localhost:9981/security",
     ]
     deny_domains: List[str] = []
@@ -13,34 +13,34 @@ from twisted.python.failure import Failure
 
 EXCLUDED_URLS = [
     # Google calendar returns 404s on HEAD requests unconditionally
-    'https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com',
+    "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
     # Returns 409 errors to HEAD requests frequently
-    'https://medium.freecodecamp.org/',
+    "https://medium.freecodecamp.org/",
     # Returns 404 to HEAD requests unconditionally
-    'https://www.git-tower.com/blog/command-line-cheat-sheet/',
-    'https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode',
+    "https://www.git-tower.com/blog/command-line-cheat-sheet/",
+    "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
     # Requires authentication
-    'https://circleci.com/gh/zulip/zulip/tree/master',
-    'https://circleci.com/gh/zulip/zulip/16617',
-    'https://www.linkedin.com/company/zulip-project',
+    "https://circleci.com/gh/zulip/zulip/tree/master",
+    "https://circleci.com/gh/zulip/zulip/16617",
+    "https://www.linkedin.com/company/zulip-project",
     # Returns 403 errors to HEAD requests
-    'https://giphy.com',
-    'https://giphy.com/apps/giphycapture',
-    'https://www.udemy.com/course/the-complete-react-native-and-redux-course/',
+    "https://giphy.com",
+    "https://giphy.com/apps/giphycapture",
+    "https://www.udemy.com/course/the-complete-react-native-and-redux-course/",
 ]
 
 VNU_IGNORE = [
     # Real errors that should be fixed.
-    r'Duplicate ID “[^”]*”\.',
-    r'The first occurrence of ID “[^”]*” was here\.',
-    r'Attribute “markdown” not allowed on element “div” at this point\.',
-    r'No “p” element in scope but a “p” end tag seen\.',
-    r'Element “div” not allowed as child of element “ul” in this context\. '
-    + r'\(Suppressing further errors from this subtree\.\)',
+    r"Duplicate ID “[^”]*”\.",
+    r"The first occurrence of ID “[^”]*” was here\.",
+    r"Attribute “markdown” not allowed on element “div” at this point\.",
+    r"No “p” element in scope but a “p” end tag seen\.",
+    r"Element “div” not allowed as child of element “ul” in this context\. "
+    + r"\(Suppressing further errors from this subtree\.\)",
     # Warnings that are probably less important.
-    r'The “type” attribute is unnecessary for JavaScript resources\.',
+    r"The “type” attribute is unnecessary for JavaScript resources\.",
 ]
-VNU_IGNORE_REGEX = re.compile(r'|'.join(VNU_IGNORE))
+VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))
 
 DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
 
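A note on the VNU_IGNORE_REGEX line above: joining the patterns with "|" builds one alternation, and fullmatch() later requires an entire validator message to match one alternative before it is suppressed. A small self-contained sketch of that mechanism, with a shortened pattern list for illustration:

import re

VNU_IGNORE = [
    r"Duplicate ID “[^”]*”\.",
    r"The “type” attribute is unnecessary for JavaScript resources\.",
]
VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))

# fullmatch: the whole message must match one alternative to be ignored.
assert VNU_IGNORE_REGEX.fullmatch("Duplicate ID “main”.")
assert not VNU_IGNORE_REGEX.fullmatch("Unexpected “div” in this context.")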
@@ -54,15 +54,15 @@ class BaseDocumentationSpider(scrapy.Spider):
     deny_domains: List[str] = []
     start_urls: List[str] = []
     deny: List[str] = []
-    file_extensions: List[str] = ['.' + ext for ext in IGNORED_EXTENSIONS]
-    tags = ('a', 'area', 'img')
-    attrs = ('href', 'src')
+    file_extensions: List[str] = ["." + ext for ext in IGNORED_EXTENSIONS]
+    tags = ("a", "area", "img")
+    attrs = ("href", "src")
 
     def _has_extension(self, url: str) -> bool:
         return url_has_any_extension(url, self.file_extensions)
 
     def _is_external_url(self, url: str) -> bool:
-        return url.startswith('http') or self._has_extension(url)
+        return url.startswith("http") or self._has_extension(url)
 
     def check_existing(self, response: Response) -> None:
         self.log(response)
@@ -85,7 +85,7 @@ class BaseDocumentationSpider(scrapy.Spider):
         ):
             # We can verify these links directly in the local git repo without making any requests to GitHub servers.
             return False
-        if 'github.com/zulip' in url:
+        if "github.com/zulip" in url:
             # We want to check these links but due to rate limiting from GitHub, these checks often
             # fail in the CI. Thus, we should treat these as external links for now.
             # TODO: Figure out how to test github.com/zulip links in CI.
@@ -98,7 +98,7 @@
         m = re.match(r".+\#(?P<fragment>.*)$", response.request.url)  # Get fragment value.
         if not m:
             return
-        fragment = m.group('fragment')
+        fragment = m.group("fragment")
         # Check fragment existing on response page.
         if not response.selector.xpath(xpath_template.format(fragment=fragment)):
             self.logger.error(
@@ -108,17 +108,17 @@
     def _vnu_callback(self, url: str) -> Callable[[Response], None]:
         def callback(response: Response) -> None:
             vnu_out = json.loads(response.text)
-            for message in vnu_out['messages']:
-                if not VNU_IGNORE_REGEX.fullmatch(message['message']):
+            for message in vnu_out["messages"]:
+                if not VNU_IGNORE_REGEX.fullmatch(message["message"]):
                     self.logger.error(
                         '"%s":%d.%d-%d.%d: %s: %s',
                         url,
-                        message.get('firstLine', message['lastLine']),
-                        message.get('firstColumn', message['lastColumn']),
-                        message['lastLine'],
-                        message['lastColumn'],
-                        message['type'],
-                        message['message'],
+                        message.get("firstLine", message["lastLine"]),
+                        message.get("firstColumn", message["lastColumn"]),
+                        message["lastLine"],
+                        message["lastColumn"],
+                        message["type"],
+                        message["message"],
                     )
 
         return callback
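Note that the format string '"%s":%d.%d-%d.%d: %s: %s' in this hunk stays single-quoted: Black only switches a string to double quotes when that adds no escaping, so literals that themselves contain a double quote keep their single quotes. A tiny illustration of the rule:

plain = 'messages'    # Black rewrites this to "messages"
quoted = '"%s": %s'   # left alone: double quotes would force \" escapes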
@@ -129,18 +129,18 @@
         # crawl documentation served by the webapp (E.g. /help/), we
         # don't want to crawl the webapp itself, so we exclude these.
         if (
-            url in ['http://localhost:9981/', 'http://localhost:9981']
-            or url.startswith('http://localhost:9981/#')
-            or url.startswith('http://localhost:9981#')
+            url in ["http://localhost:9981/", "http://localhost:9981"]
+            or url.startswith("http://localhost:9981/#")
+            or url.startswith("http://localhost:9981#")
         ):
             return
 
         callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
         dont_filter = False
-        method = 'GET'
+        method = "GET"
         if self._is_external_url(url):
             callback = self.check_existing
-            method = 'HEAD'
+            method = "HEAD"
 
         if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX):
             file_path = url.replace(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX, DEPLOY_ROOT)
@@ -159,10 +159,10 @@
                 "There is no local directory associated with the GitHub URL: %s", url
             )
             return
-        elif '#' in url:
+        elif "#" in url:
             dont_filter = True
             callback = self.check_fragment
-        if getattr(self, 'skip_external', False) and self._is_external_link(url):
+        if getattr(self, "skip_external", False) and self._is_external_link(url):
             return
         yield Request(
             url,
@@ -179,11 +179,11 @@
     def parse(self, response: Response) -> Iterator[Request]:
         self.log(response)
 
-        if getattr(self, 'validate_html', False):
+        if getattr(self, "validate_html", False):
             yield Request(
-                'http://127.0.0.1:9988/?out=json',
-                method='POST',
-                headers={'Content-Type': response.headers['Content-Type']},
+                "http://127.0.0.1:9988/?out=json",
+                method="POST",
+                headers={"Content-Type": response.headers["Content-Type"]},
                 body=response.body,
                 callback=self._vnu_callback(response.url),
                 errback=self.error_callback,
@@ -191,7 +191,7 @@
 
         for link in LxmlLinkExtractor(
             deny_domains=self.deny_domains,
-            deny_extensions=['doc'],
+            deny_extensions=["doc"],
             tags=self.tags,
             attrs=self.attrs,
             deny=self.deny,
@@ -200,7 +200,7 @@
             yield from self._make_requests(link.url)
 
     def retry_request_with_get(self, request: Request) -> Iterator[Request]:
-        request.method = 'GET'
+        request.method = "GET"
         request.dont_filter = True
         yield request
 
@@ -212,7 +212,7 @@
         response = failure.value.response
         if self.exclude_error(response.url):
             return None
-        if response.status == 405 and response.request.method == 'HEAD':
+        if response.status == 405 and response.request.method == "HEAD":
             # Method 'HEAD' not allowed, repeat request with 'GET'
             return self.retry_request_with_get(response.request)
         self.logger.error("Please check link: %s", response.request.url)