Mirror of https://github.com/zulip/zulip.git, synced 2025-11-07 15:33:30 +00:00
python: Reformat with Black, except quotes.
Signed-off-by: Anders Kaseorg <anders@zulip.com>
Committed by: Tim Abbott
Parent: 5028c081cb
Commit: 11741543da
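The commit title describes running Black with string normalization disabled, so the formatter reflows code but leaves the existing single-quoted strings untouched. Below is a minimal sketch of that behaviour through Black's Python API; the exact options Zulip's tooling passes (line length, target versions, and so on) are an assumption here, not something taken from this commit.

# Sketch only: demonstrates "reformat, except quotes" on one line from the diff below.
# Assumes the `black` package is installed; Mode options other than
# string_normalization are left at their defaults, which may differ from
# Zulip's actual configuration.
import black

source = (
    "USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) '\n"
    "              'AppleWebKit/537.36 (KHTML, like Gecko) '\n"
    "              'Chrome/54.0.2840.59 Safari/537.36')\n"
)

# string_normalization=False keeps the original quote characters instead of
# rewriting them to double quotes.
formatted = black.format_str(source, mode=black.Mode(string_normalization=False))
print(formatted)

With these options Black splits the long assignment across parenthesized lines, much as the USER_AGENT change in the first hunk below shows, while the single quotes are preserved.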
@@ -17,77 +17,79 @@ DOWNLOAD_TIMEOUT = 15
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) '
-              'AppleWebKit/537.36 (KHTML, like Gecko) '
-              'Chrome/54.0.2840.59 Safari/537.36')
+USER_AGENT = (
+    'Mozilla/5.0 (X11; Linux x86_64) '
+    'AppleWebKit/537.36 (KHTML, like Gecko) '
+    'Chrome/54.0.2840.59 Safari/537.36'
+)

 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32

 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False

 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 # 'Accept-Language': 'en',
-#}
+# }

 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 # 'documentation_crawler.middlewares.MyCustomSpiderMiddleware': 543,
-#}
+# }

 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
 # 'documentation_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }

 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 # 'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }

 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 # 'documentation_crawler.pipelines.SomePipeline': 300,
-#}
+# }

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False

 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
@@ -8,8 +8,9 @@ from .common.spiders import BaseDocumentationSpider
 def get_start_url() -> List[str]:
     # Get index.html file as start URL and convert it to file URI
     dir_path = os.path.dirname(os.path.realpath(__file__))
-    start_file = os.path.join(dir_path, os.path.join(*[os.pardir] * 4),
-                              "docs/_build/html/index.html")
+    start_file = os.path.join(
+        dir_path, os.path.join(*[os.pardir] * 4), "docs/_build/html/index.html"
+    )
     return [
         pathlib.Path(os.path.abspath(start_file)).as_uri(),
     ]
@@ -30,10 +30,13 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
     def closed(self, *args: Any, **kwargs: Any) -> None:
         unused_images = set(os.listdir(self.images_static_dir)) - self.static_images
         if unused_images:
-            exception_message = "The following images are not used in documentation and can be removed: {}"
+            exception_message = (
+                "The following images are not used in documentation and can be removed: {}"
+            )
             self._set_error_state()
             unused_images_relatedpath = [
-                os.path.join(self.images_path, img) for img in unused_images]
+                os.path.join(self.images_path, img) for img in unused_images
+            ]
             raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
@@ -51,6 +54,7 @@ class APIDocumentationSpider(UnusedImagesLinterSpider):
     deny_domains: List[str] = []
     images_path = "static/images/api"

+
 class PorticoDocumentationSpider(BaseDocumentationSpider):
     def _is_external_url(self, url: str) -> bool:
         return (
@@ -61,19 +65,21 @@ class PorticoDocumentationSpider(BaseDocumentationSpider):
         )

     name = 'portico_documentation_crawler'
-    start_urls = ['http://localhost:9981/hello',
-                  'http://localhost:9981/history',
-                  'http://localhost:9981/plans',
-                  'http://localhost:9981/team',
-                  'http://localhost:9981/apps',
-                  'http://localhost:9981/integrations',
-                  'http://localhost:9981/terms',
-                  'http://localhost:9981/privacy',
-                  'http://localhost:9981/features',
-                  'http://localhost:9981/why-zulip',
-                  'http://localhost:9981/for/open-source',
-                  'http://localhost:9981/for/companies',
-                  'http://localhost:9981/for/working-groups-and-communities',
-                  'http://localhost:9981/for/research',
-                  'http://localhost:9981/security']
+    start_urls = [
+        'http://localhost:9981/hello',
+        'http://localhost:9981/history',
+        'http://localhost:9981/plans',
+        'http://localhost:9981/team',
+        'http://localhost:9981/apps',
+        'http://localhost:9981/integrations',
+        'http://localhost:9981/terms',
+        'http://localhost:9981/privacy',
+        'http://localhost:9981/features',
+        'http://localhost:9981/why-zulip',
+        'http://localhost:9981/for/open-source',
+        'http://localhost:9981/for/companies',
+        'http://localhost:9981/for/working-groups-and-communities',
+        'http://localhost:9981/for/research',
+        'http://localhost:9981/security',
+    ]
     deny_domains: List[str] = []
@@ -37,7 +37,6 @@ VNU_IGNORE = [
     r'No “p” element in scope but a “p” end tag seen\.',
     r'Element “div” not allowed as child of element “ul” in this context\. '
     + r'\(Suppressing further errors from this subtree\.\)',
-
     # Warnings that are probably less important.
     r'The “type” attribute is unnecessary for JavaScript resources\.',
 ]
@@ -48,6 +47,7 @@ DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
 ZULIP_SERVER_GITHUB_FILE_URL_PREFIX = "https://github.com/zulip/zulip/blob/master"
 ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX = "https://github.com/zulip/zulip/tree/master"

+
 class BaseDocumentationSpider(scrapy.Spider):
     name: Optional[str] = None
     # Exclude domain address.
@@ -80,7 +80,9 @@ class BaseDocumentationSpider(scrapy.Spider):
         if (len(url) > 4 and url[:4] == "file") or ("localhost" in url):
             # We also want CI to check any links to built documentation.
             return False
-        if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX) or url.startswith(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX):
+        if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX) or url.startswith(
+            ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX
+        ):
             # We can verify these links directly in the local git repo without making any requests to GitHub servers.
             return False
         if 'github.com/zulip' in url:
@@ -100,7 +102,8 @@ class BaseDocumentationSpider(scrapy.Spider):
         # Check fragment existing on response page.
         if not response.selector.xpath(xpath_template.format(fragment=fragment)):
             self.logger.error(
-                "Fragment #%s is not found on page %s", fragment, response.request.url)
+                "Fragment #%s is not found on page %s", fragment, response.request.url
+            )

     def _vnu_callback(self, url: str) -> Callable[[Response], None]:
         def callback(response: Response) -> None:
@@ -125,7 +128,11 @@ class BaseDocumentationSpider(scrapy.Spider):
         # can be accessible without login an account. While we do
         # crawl documentation served by the webapp (E.g. /help/), we
         # don't want to crawl the webapp itself, so we exclude these.
-        if url in ['http://localhost:9981/', 'http://localhost:9981'] or url.startswith('http://localhost:9981/#') or url.startswith('http://localhost:9981#'):
+        if (
+            url in ['http://localhost:9981/', 'http://localhost:9981']
+            or url.startswith('http://localhost:9981/#')
+            or url.startswith('http://localhost:9981#')
+        ):
             return

         callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
@@ -141,20 +148,29 @@ class BaseDocumentationSpider(scrapy.Spider):
             if hash_index != -1:
                 file_path = file_path[:hash_index]
             if not os.path.isfile(file_path):
-                self.logger.error("There is no local file associated with the GitHub URL: %s", url)
+                self.logger.error(
+                    "There is no local file associated with the GitHub URL: %s", url
+                )
                 return
         elif url.startswith(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX):
             dir_path = url.replace(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX, DEPLOY_ROOT)
             if not os.path.isdir(dir_path):
-                self.logger.error("There is no local directory associated with the GitHub URL: %s", url)
+                self.logger.error(
+                    "There is no local directory associated with the GitHub URL: %s", url
+                )
                 return
         elif '#' in url:
             dont_filter = True
             callback = self.check_fragment
         if getattr(self, 'skip_external', False) and self._is_external_link(url):
             return
-        yield Request(url, method=method, callback=callback, dont_filter=dont_filter,
-                      errback=self.error_callback)
+        yield Request(
+            url,
+            method=method,
+            callback=callback,
+            dont_filter=dont_filter,
+            errback=self.error_callback,
+        )

     def start_requests(self) -> Iterator[Request]:
         for url in self.start_urls:
@@ -173,9 +189,14 @@ class BaseDocumentationSpider(scrapy.Spider):
                 errback=self.error_callback,
             )

-        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
-                                      tags=self.tags, attrs=self.attrs, deny=self.deny,
-                                      canonicalize=False).extract_links(response):
+        for link in LxmlLinkExtractor(
+            deny_domains=self.deny_domains,
+            deny_extensions=['doc'],
+            tags=self.tags,
+            attrs=self.attrs,
+            deny=self.deny,
+            canonicalize=False,
+        ).extract_links(response):
             yield from self._make_requests(link.url)

     def retry_request_with_get(self, request: Request) -> Iterator[Request]: