python: Reformat with Black, except quotes.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
Anders Kaseorg authored 2021-02-11 23:19:30 -08:00, committed by Tim Abbott
parent 5028c081cb
commit 11741543da
817 changed files with 44952 additions and 24860 deletions
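
Note: the exact invocation is not shown in this diff. "Except quotes" presumably means Black was run with string normalization disabled, so existing single-quoted strings are left untouched. A minimal sketch of that behavior through Black's Python API, using an illustrative snippet and line length rather than the project's actual configuration:

import black

# Illustrative input; string_normalization=False makes Black re-wrap the code
# without rewriting 'single' quotes as "double" quotes.
source = (
    "USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) '\n"
    "              'AppleWebKit/537.36 (KHTML, like Gecko)')\n"
)

mode = black.Mode(line_length=88, string_normalization=False)
print(black.format_str(source, mode=mode))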

View File

@@ -17,77 +17,79 @@ DOWNLOAD_TIMEOUT = 15
# Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) '
-'AppleWebKit/537.36 (KHTML, like Gecko) '
-'Chrome/54.0.2840.59 Safari/537.36')
+USER_AGENT = (
+'Mozilla/5.0 (X11; Linux x86_64) '
+'AppleWebKit/537.36 (KHTML, like Gecko) '
+'Chrome/54.0.2840.59 Safari/537.36'
+)
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
-#}
+# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
# 'documentation_crawler.middlewares.MyCustomSpiderMiddleware': 543,
-#}
+# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
# 'documentation_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
# 'documentation_crawler.pipelines.SomePipeline': 300,
-#}
+# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
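
For context, the hunk above only normalizes comment spacing in the crawler's Scrapy settings template; no setting is enabled or disabled. If one of the commented-out blocks were actually turned on, it would look like this hypothetical addition to the settings module, with values taken from the template's suggestions rather than from Zulip's configuration:

# Hypothetical: enable AutoThrottle using the values suggested in the
# commented-out template block above (standard Scrapy setting names).
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = False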

View File

@@ -8,8 +8,9 @@ from .common.spiders import BaseDocumentationSpider
def get_start_url() -> List[str]:
# Get index.html file as start URL and convert it to file URI
dir_path = os.path.dirname(os.path.realpath(__file__))
-start_file = os.path.join(dir_path, os.path.join(*[os.pardir] * 4),
-"docs/_build/html/index.html")
+start_file = os.path.join(
+dir_path, os.path.join(*[os.pardir] * 4), "docs/_build/html/index.html"
+)
return [
pathlib.Path(os.path.abspath(start_file)).as_uri(),
]
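
The hunk above is purely a re-wrap of the os.path.join call. As a standalone illustration of what get_start_url computes, the following sketch performs the same path-to-file-URI conversion; the directory layout is assumed, not taken from this diff:

import os
import pathlib

# Climb four directories up from this module, then point at the built docs.
dir_path = os.path.dirname(os.path.realpath(__file__))
start_file = os.path.join(dir_path, *[os.pardir] * 4, "docs/_build/html/index.html")

# as_uri() yields something like file:///path/to/zulip/docs/_build/html/index.html
start_url = pathlib.Path(os.path.abspath(start_file)).as_uri()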

View File

@@ -30,10 +30,13 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
def closed(self, *args: Any, **kwargs: Any) -> None:
unused_images = set(os.listdir(self.images_static_dir)) - self.static_images
if unused_images:
-exception_message = "The following images are not used in documentation and can be removed: {}"
+exception_message = (
+"The following images are not used in documentation and can be removed: {}"
+)
self._set_error_state()
unused_images_relatedpath = [
-os.path.join(self.images_path, img) for img in unused_images]
+os.path.join(self.images_path, img) for img in unused_images
+]
raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
@@ -51,6 +54,7 @@ class APIDocumentationSpider(UnusedImagesLinterSpider):
deny_domains: List[str] = []
images_path = "static/images/api"
class PorticoDocumentationSpider(BaseDocumentationSpider):
def _is_external_url(self, url: str) -> bool:
return (
@@ -61,19 +65,21 @@ class PorticoDocumentationSpider(BaseDocumentationSpider):
)
name = 'portico_documentation_crawler'
-start_urls = ['http://localhost:9981/hello',
-'http://localhost:9981/history',
-'http://localhost:9981/plans',
-'http://localhost:9981/team',
-'http://localhost:9981/apps',
-'http://localhost:9981/integrations',
-'http://localhost:9981/terms',
-'http://localhost:9981/privacy',
-'http://localhost:9981/features',
-'http://localhost:9981/why-zulip',
-'http://localhost:9981/for/open-source',
-'http://localhost:9981/for/companies',
-'http://localhost:9981/for/working-groups-and-communities',
-'http://localhost:9981/for/research',
-'http://localhost:9981/security']
+start_urls = [
+'http://localhost:9981/hello',
+'http://localhost:9981/history',
+'http://localhost:9981/plans',
+'http://localhost:9981/team',
+'http://localhost:9981/apps',
+'http://localhost:9981/integrations',
+'http://localhost:9981/terms',
+'http://localhost:9981/privacy',
+'http://localhost:9981/features',
+'http://localhost:9981/why-zulip',
+'http://localhost:9981/for/open-source',
+'http://localhost:9981/for/companies',
+'http://localhost:9981/for/working-groups-and-communities',
+'http://localhost:9981/for/research',
+'http://localhost:9981/security',
+]
deny_domains: List[str] = []
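
As a side note on the UnusedImagesLinterSpider hunk further up: the check is a plain set difference between the image files on disk and the images the crawler actually saw referenced. A self-contained sketch of that pattern, with hypothetical directory and file names:

import os

# Hypothetical inputs; the real spider derives these from images_static_dir
# and the static_images set it accumulates while crawling.
images_static_dir = "static/images/help"
referenced_images = {"edit-message.png", "delete-message.png"}

unused_images = set(os.listdir(images_static_dir)) - referenced_images
if unused_images:
    unused_paths = [os.path.join(images_static_dir, img) for img in sorted(unused_images)]
    raise Exception(
        "The following images are not used in documentation and can be removed: "
        + ", ".join(unused_paths)
    )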

View File

@@ -37,7 +37,6 @@ VNU_IGNORE = [
r'No “p” element in scope but a “p” end tag seen\.',
r'Element “div” not allowed as child of element “ul” in this context\. '
+ r'\(Suppressing further errors from this subtree\.\)',
# Warnings that are probably less important.
r'The “type” attribute is unnecessary for JavaScript resources\.',
]
@@ -48,6 +47,7 @@ DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
ZULIP_SERVER_GITHUB_FILE_URL_PREFIX = "https://github.com/zulip/zulip/blob/master"
ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX = "https://github.com/zulip/zulip/tree/master"
class BaseDocumentationSpider(scrapy.Spider):
name: Optional[str] = None
# Exclude domain address.
@@ -80,7 +80,9 @@ class BaseDocumentationSpider(scrapy.Spider):
if (len(url) > 4 and url[:4] == "file") or ("localhost" in url):
# We also want CI to check any links to built documentation.
return False
-if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX) or url.startswith(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX):
+if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX) or url.startswith(
+ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX
+):
# We can verify these links directly in the local git repo without making any requests to GitHub servers.
return False
if 'github.com/zulip' in url:
@@ -100,7 +102,8 @@ class BaseDocumentationSpider(scrapy.Spider):
# Check fragment existing on response page.
if not response.selector.xpath(xpath_template.format(fragment=fragment)):
self.logger.error(
-"Fragment #%s is not found on page %s", fragment, response.request.url)
+"Fragment #%s is not found on page %s", fragment, response.request.url
+)
def _vnu_callback(self, url: str) -> Callable[[Response], None]:
def callback(response: Response) -> None:
@@ -125,7 +128,11 @@ class BaseDocumentationSpider(scrapy.Spider):
# can be accessible without login an account. While we do
# crawl documentation served by the webapp (E.g. /help/), we
# don't want to crawl the webapp itself, so we exclude these.
-if url in ['http://localhost:9981/', 'http://localhost:9981'] or url.startswith('http://localhost:9981/#') or url.startswith('http://localhost:9981#'):
+if (
+url in ['http://localhost:9981/', 'http://localhost:9981']
+or url.startswith('http://localhost:9981/#')
+or url.startswith('http://localhost:9981#')
+):
return
callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
@@ -141,20 +148,29 @@ class BaseDocumentationSpider(scrapy.Spider):
if hash_index != -1:
file_path = file_path[:hash_index]
if not os.path.isfile(file_path):
-self.logger.error("There is no local file associated with the GitHub URL: %s", url)
+self.logger.error(
+"There is no local file associated with the GitHub URL: %s", url
+)
return
elif url.startswith(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX):
dir_path = url.replace(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX, DEPLOY_ROOT)
if not os.path.isdir(dir_path):
-self.logger.error("There is no local directory associated with the GitHub URL: %s", url)
+self.logger.error(
+"There is no local directory associated with the GitHub URL: %s", url
+)
return
elif '#' in url:
dont_filter = True
callback = self.check_fragment
if getattr(self, 'skip_external', False) and self._is_external_link(url):
return
-yield Request(url, method=method, callback=callback, dont_filter=dont_filter,
-errback=self.error_callback)
+yield Request(
+url,
+method=method,
+callback=callback,
+dont_filter=dont_filter,
+errback=self.error_callback,
+)
def start_requests(self) -> Iterator[Request]:
for url in self.start_urls:
@@ -173,9 +189,14 @@ class BaseDocumentationSpider(scrapy.Spider):
errback=self.error_callback,
)
-for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
-tags=self.tags, attrs=self.attrs, deny=self.deny,
-canonicalize=False).extract_links(response):
+for link in LxmlLinkExtractor(
+deny_domains=self.deny_domains,
+deny_extensions=['doc'],
+tags=self.tags,
+attrs=self.attrs,
+deny=self.deny,
+canonicalize=False,
+).extract_links(response):
yield from self._make_requests(link.url)
def retry_request_with_get(self, request: Request) -> Iterator[Request]:
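
Finally, the requests yielded above follow a consistent pattern: extract links with LxmlLinkExtractor, then yield one Request per link with an errback so failures are recorded rather than silently dropped. A minimal, self-contained sketch of that pattern with a hypothetical spider (not the project's BaseDocumentationSpider):

import scrapy
from scrapy.http import Request, Response
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class DocsLinkSketchSpider(scrapy.Spider):
    name = "docs_link_sketch"
    start_urls = ["http://localhost:9981/hello"]

    def parse(self, response: Response):
        # Follow every extracted link, routing failures to an error callback.
        for link in LxmlLinkExtractor(canonicalize=False).extract_links(response):
            yield Request(link.url, callback=self.parse, errback=self.on_error)

    def on_error(self, failure):
        # Log the failure; the real spider also records an error state.
        self.logger.error(repr(failure))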