python: Normalize quotes with Black.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
Author: Anders Kaseorg
Date: 2021-02-11 23:20:45 -08:00
Committed by: Tim Abbott
Parent: 11741543da
Commit: 6e4c3e41dc
989 changed files with 32792 additions and 32792 deletions
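
Note on the change below (not part of the commit itself): Black's string
normalization rewrites string literals to prefer double quotes wherever that
does not add backslash escapes, which is the mechanical transformation applied
to every file in this diff. A minimal sketch of that behavior via Black's
Python API, for illustration only; the exact options Zulip's lint tooling
passes to Black are not shown here.

    # Illustration only; assumes Black is installed.
    # String normalization is on by default in black.Mode().
    import black

    source = "LOG_LEVEL = 'WARNING'\n"
    print(black.format_str(source, mode=black.Mode()), end="")
    # Prints: LOG_LEVEL = "WARNING"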


@@ -7,20 +7,20 @@
 # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

-BOT_NAME = 'documentation_crawler'
+BOT_NAME = "documentation_crawler"

-SPIDER_MODULES = ['documentation_crawler.spiders']
-NEWSPIDER_MODULE = 'documentation_crawler.spiders'
-COMMANDS_MODULE = 'documentation_crawler.commands'
-LOG_LEVEL = 'WARNING'
+SPIDER_MODULES = ["documentation_crawler.spiders"]
+NEWSPIDER_MODULE = "documentation_crawler.spiders"
+COMMANDS_MODULE = "documentation_crawler.commands"
+LOG_LEVEL = "WARNING"

 DOWNLOAD_TIMEOUT = 15

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 USER_AGENT = (
-    'Mozilla/5.0 (X11; Linux x86_64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/54.0.2840.59 Safari/537.36'
+    "Mozilla/5.0 (X11; Linux x86_64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/54.0.2840.59 Safari/537.36"
 )

 # Obey robots.txt rules


@@ -18,6 +18,6 @@ def get_start_url() -> List[str]:

 class DocumentationSpider(BaseDocumentationSpider):
     name = "documentation_crawler"
-    deny_domains = ['localhost:9991']
-    deny = [r'\_sources\/.*\.txt']
+    deny_domains = ["localhost:9991"]
+    deny = [r"\_sources\/.*\.txt"]
     start_urls = get_start_url()


@@ -22,8 +22,8 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
         self.images_static_dir: str = get_images_dir(self.images_path)

     def _is_external_url(self, url: str) -> bool:
-        is_external = url.startswith('http') and self.start_urls[0] not in url
-        if self._has_extension(url) and f'localhost:9981/{self.images_path}' in url:
+        is_external = url.startswith("http") and self.start_urls[0] not in url
+        if self._has_extension(url) and f"localhost:9981/{self.images_path}" in url:
             self.static_images.add(basename(urlparse(url).path))
         return is_external or self._has_extension(url)
@@ -37,20 +37,20 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
             unused_images_relatedpath = [
                 os.path.join(self.images_path, img) for img in unused_images
             ]
-            raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
+            raise Exception(exception_message.format(", ".join(unused_images_relatedpath)))


 class HelpDocumentationSpider(UnusedImagesLinterSpider):
     name = "help_documentation_crawler"
-    start_urls = ['http://localhost:9981/help']
+    start_urls = ["http://localhost:9981/help"]
     deny_domains: List[str] = []
-    deny = ['/privacy']
+    deny = ["/privacy"]
     images_path = "static/images/help"


 class APIDocumentationSpider(UnusedImagesLinterSpider):
-    name = 'api_documentation_crawler'
-    start_urls = ['http://localhost:9981/api']
+    name = "api_documentation_crawler"
+    start_urls = ["http://localhost:9981/api"]
     deny_domains: List[str] = []
     images_path = "static/images/api"
@@ -58,28 +58,28 @@ class APIDocumentationSpider(UnusedImagesLinterSpider):
 class PorticoDocumentationSpider(BaseDocumentationSpider):
     def _is_external_url(self, url: str) -> bool:
         return (
-            not url.startswith('http://localhost:9981')
-            or url.startswith('http://localhost:9981/help')
-            or url.startswith('http://localhost:9981/api')
+            not url.startswith("http://localhost:9981")
+            or url.startswith("http://localhost:9981/help")
+            or url.startswith("http://localhost:9981/api")
             or self._has_extension(url)
         )

-    name = 'portico_documentation_crawler'
+    name = "portico_documentation_crawler"
     start_urls = [
-        'http://localhost:9981/hello',
-        'http://localhost:9981/history',
-        'http://localhost:9981/plans',
-        'http://localhost:9981/team',
-        'http://localhost:9981/apps',
-        'http://localhost:9981/integrations',
-        'http://localhost:9981/terms',
-        'http://localhost:9981/privacy',
-        'http://localhost:9981/features',
-        'http://localhost:9981/why-zulip',
-        'http://localhost:9981/for/open-source',
-        'http://localhost:9981/for/companies',
-        'http://localhost:9981/for/working-groups-and-communities',
-        'http://localhost:9981/for/research',
-        'http://localhost:9981/security',
+        "http://localhost:9981/hello",
+        "http://localhost:9981/history",
+        "http://localhost:9981/plans",
+        "http://localhost:9981/team",
+        "http://localhost:9981/apps",
+        "http://localhost:9981/integrations",
+        "http://localhost:9981/terms",
+        "http://localhost:9981/privacy",
+        "http://localhost:9981/features",
+        "http://localhost:9981/why-zulip",
+        "http://localhost:9981/for/open-source",
+        "http://localhost:9981/for/companies",
+        "http://localhost:9981/for/working-groups-and-communities",
+        "http://localhost:9981/for/research",
+        "http://localhost:9981/security",
     ]
     deny_domains: List[str] = []


@@ -13,34 +13,34 @@ from twisted.python.failure import Failure

 EXCLUDED_URLS = [
     # Google calendar returns 404s on HEAD requests unconditionally
-    'https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com',
+    "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
     # Returns 409 errors to HEAD requests frequently
-    'https://medium.freecodecamp.org/',
+    "https://medium.freecodecamp.org/",
     # Returns 404 to HEAD requests unconditionally
-    'https://www.git-tower.com/blog/command-line-cheat-sheet/',
-    'https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode',
+    "https://www.git-tower.com/blog/command-line-cheat-sheet/",
+    "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
     # Requires authentication
-    'https://circleci.com/gh/zulip/zulip/tree/master',
-    'https://circleci.com/gh/zulip/zulip/16617',
-    'https://www.linkedin.com/company/zulip-project',
+    "https://circleci.com/gh/zulip/zulip/tree/master",
+    "https://circleci.com/gh/zulip/zulip/16617",
+    "https://www.linkedin.com/company/zulip-project",
     # Returns 403 errors to HEAD requests
-    'https://giphy.com',
-    'https://giphy.com/apps/giphycapture',
-    'https://www.udemy.com/course/the-complete-react-native-and-redux-course/',
+    "https://giphy.com",
+    "https://giphy.com/apps/giphycapture",
+    "https://www.udemy.com/course/the-complete-react-native-and-redux-course/",
 ]

 VNU_IGNORE = [
     # Real errors that should be fixed.
-    r'Duplicate ID “[^”]*”\.',
-    r'The first occurrence of ID “[^”]*” was here\.',
-    r'Attribute “markdown” not allowed on element “div” at this point\.',
-    r'No “p” element in scope but a “p” end tag seen\.',
-    r'Element “div” not allowed as child of element “ul” in this context\. '
-    + r'\(Suppressing further errors from this subtree\.\)',
+    r"Duplicate ID “[^”]*”\.",
+    r"The first occurrence of ID “[^”]*” was here\.",
+    r"Attribute “markdown” not allowed on element “div” at this point\.",
+    r"No “p” element in scope but a “p” end tag seen\.",
+    r"Element “div” not allowed as child of element “ul” in this context\. "
+    + r"\(Suppressing further errors from this subtree\.\)",
     # Warnings that are probably less important.
-    r'The “type” attribute is unnecessary for JavaScript resources\.',
+    r"The “type” attribute is unnecessary for JavaScript resources\.",
 ]
-VNU_IGNORE_REGEX = re.compile(r'|'.join(VNU_IGNORE))
+VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))

 DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
@@ -54,15 +54,15 @@ class BaseDocumentationSpider(scrapy.Spider):
     deny_domains: List[str] = []
     start_urls: List[str] = []
     deny: List[str] = []
-    file_extensions: List[str] = ['.' + ext for ext in IGNORED_EXTENSIONS]
-    tags = ('a', 'area', 'img')
-    attrs = ('href', 'src')
+    file_extensions: List[str] = ["." + ext for ext in IGNORED_EXTENSIONS]
+    tags = ("a", "area", "img")
+    attrs = ("href", "src")

     def _has_extension(self, url: str) -> bool:
         return url_has_any_extension(url, self.file_extensions)

     def _is_external_url(self, url: str) -> bool:
-        return url.startswith('http') or self._has_extension(url)
+        return url.startswith("http") or self._has_extension(url)

     def check_existing(self, response: Response) -> None:
         self.log(response)
@@ -85,7 +85,7 @@ class BaseDocumentationSpider(scrapy.Spider):
         ):
             # We can verify these links directly in the local git repo without making any requests to GitHub servers.
             return False
-        if 'github.com/zulip' in url:
+        if "github.com/zulip" in url:
             # We want to check these links but due to rate limiting from GitHub, these checks often
             # fail in the CI. Thus, we should treat these as external links for now.
             # TODO: Figure out how to test github.com/zulip links in CI.
@@ -98,7 +98,7 @@ class BaseDocumentationSpider(scrapy.Spider):
         m = re.match(r".+\#(?P<fragment>.*)$", response.request.url)  # Get fragment value.
         if not m:
             return
-        fragment = m.group('fragment')
+        fragment = m.group("fragment")
         # Check fragment existing on response page.
         if not response.selector.xpath(xpath_template.format(fragment=fragment)):
             self.logger.error(
@@ -108,17 +108,17 @@ class BaseDocumentationSpider(scrapy.Spider):
     def _vnu_callback(self, url: str) -> Callable[[Response], None]:
         def callback(response: Response) -> None:
             vnu_out = json.loads(response.text)
-            for message in vnu_out['messages']:
-                if not VNU_IGNORE_REGEX.fullmatch(message['message']):
+            for message in vnu_out["messages"]:
+                if not VNU_IGNORE_REGEX.fullmatch(message["message"]):
                     self.logger.error(
                         '"%s":%d.%d-%d.%d: %s: %s',
                         url,
-                        message.get('firstLine', message['lastLine']),
-                        message.get('firstColumn', message['lastColumn']),
-                        message['lastLine'],
-                        message['lastColumn'],
-                        message['type'],
-                        message['message'],
+                        message.get("firstLine", message["lastLine"]),
+                        message.get("firstColumn", message["lastColumn"]),
+                        message["lastLine"],
+                        message["lastColumn"],
+                        message["type"],
+                        message["message"],
                     )

         return callback
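
(Aside, not part of the diff: the callback above parses the JSON report
returned by the vnu / Nu Html Checker service that pages are posted to. A
hypothetical payload showing only the keys the code reads, with invented
values, might look like this.)

    # Hypothetical example payload; field values are invented for illustration.
    vnu_out = {
        "messages": [
            {
                "type": "error",
                "lastLine": 12,
                "lastColumn": 38,
                "message": "Duplicate ID “nav”.",
            },
        ],
    }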
@@ -129,18 +129,18 @@ class BaseDocumentationSpider(scrapy.Spider):
         # crawl documentation served by the webapp (E.g. /help/), we
         # don't want to crawl the webapp itself, so we exclude these.
         if (
-            url in ['http://localhost:9981/', 'http://localhost:9981']
-            or url.startswith('http://localhost:9981/#')
-            or url.startswith('http://localhost:9981#')
+            url in ["http://localhost:9981/", "http://localhost:9981"]
+            or url.startswith("http://localhost:9981/#")
+            or url.startswith("http://localhost:9981#")
         ):
             return

         callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
         dont_filter = False
-        method = 'GET'
+        method = "GET"
         if self._is_external_url(url):
             callback = self.check_existing
-            method = 'HEAD'
+            method = "HEAD"

         if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX):
             file_path = url.replace(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX, DEPLOY_ROOT)
@@ -159,10 +159,10 @@ class BaseDocumentationSpider(scrapy.Spider):
                     "There is no local directory associated with the GitHub URL: %s", url
                 )
             return
-        elif '#' in url:
+        elif "#" in url:
             dont_filter = True
             callback = self.check_fragment
-        if getattr(self, 'skip_external', False) and self._is_external_link(url):
+        if getattr(self, "skip_external", False) and self._is_external_link(url):
             return
         yield Request(
             url,
@@ -179,11 +179,11 @@ class BaseDocumentationSpider(scrapy.Spider):
     def parse(self, response: Response) -> Iterator[Request]:
         self.log(response)

-        if getattr(self, 'validate_html', False):
+        if getattr(self, "validate_html", False):
             yield Request(
-                'http://127.0.0.1:9988/?out=json',
-                method='POST',
-                headers={'Content-Type': response.headers['Content-Type']},
+                "http://127.0.0.1:9988/?out=json",
+                method="POST",
+                headers={"Content-Type": response.headers["Content-Type"]},
                 body=response.body,
                 callback=self._vnu_callback(response.url),
                 errback=self.error_callback,
@@ -191,7 +191,7 @@ class BaseDocumentationSpider(scrapy.Spider):

         for link in LxmlLinkExtractor(
             deny_domains=self.deny_domains,
-            deny_extensions=['doc'],
+            deny_extensions=["doc"],
             tags=self.tags,
             attrs=self.attrs,
             deny=self.deny,
@@ -200,7 +200,7 @@ class BaseDocumentationSpider(scrapy.Spider):
             yield from self._make_requests(link.url)

     def retry_request_with_get(self, request: Request) -> Iterator[Request]:
-        request.method = 'GET'
+        request.method = "GET"
         request.dont_filter = True
         yield request
@@ -212,7 +212,7 @@ class BaseDocumentationSpider(scrapy.Spider):
             response = failure.value.response
             if self.exclude_error(response.url):
                 return None
-            if response.status == 405 and response.request.method == 'HEAD':
+            if response.status == 405 and response.request.method == "HEAD":
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
             self.logger.error("Please check link: %s", response.request.url)
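
(Aside, not part of the commit: a rough sketch of driving one of the spiders
above programmatically with Scrapy. The import path is assumed from
SPIDER_MODULES in the settings hunk, and skip_external / validate_html are the
attributes the base spider reads with getattr(); in Zulip these spiders are
normally run by the documentation test tooling, which this diff does not touch.)

    # Rough usage sketch; the module path and spider keyword arguments below
    # are assumptions, not taken from this commit.
    from scrapy.crawler import CrawlerProcess

    from documentation_crawler.spiders.check_help_documentation import (
        HelpDocumentationSpider,
    )

    process = CrawlerProcess(settings={"LOG_LEVEL": "WARNING"})
    # Scrapy passes these keyword arguments through to the spider instance,
    # where the base class reads them with getattr().
    process.crawl(HelpDocumentationSpider, skip_external=True, validate_html=False)
    process.start()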