python: Normalize quotes with Black.

Signed-off-by: Anders Kaseorg <anders@zulip.com>

Repository: zulip/zulip (https://github.com/zulip/zulip.git)
Commit: 6e4c3e41dc
Parent: 11741543da
Committed by: Tim Abbott

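Black normalizes string quotes by rewriting a literal to double quotes whenever the rewrite does not introduce new backslash escapes, which is why every change below is a pure delimiter swap. A minimal sketch of the rule (the assignments are illustrative, echoing literals from this diff):

    BOT_NAME = 'documentation_crawler'    # becomes: BOT_NAME = "documentation_crawler"
    deny = ['/privacy']                   # becomes: deny = ["/privacy"]

    # A literal that itself contains a double quote keeps its single quotes,
    # since converting it would force escaping; the logger format string
    # '"%s":%d.%d-%d.%d: %s: %s' in the diff below is untouched for this reason.
    fmt = '"%s":%d.%d-%d.%d: %s: %s'      # unchanged by Black
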
@@ -7,20 +7,20 @@
 #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'documentation_crawler'
+BOT_NAME = "documentation_crawler"
 
-SPIDER_MODULES = ['documentation_crawler.spiders']
-NEWSPIDER_MODULE = 'documentation_crawler.spiders'
-COMMANDS_MODULE = 'documentation_crawler.commands'
-LOG_LEVEL = 'WARNING'
+SPIDER_MODULES = ["documentation_crawler.spiders"]
+NEWSPIDER_MODULE = "documentation_crawler.spiders"
+COMMANDS_MODULE = "documentation_crawler.commands"
+LOG_LEVEL = "WARNING"
 DOWNLOAD_TIMEOUT = 15
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 USER_AGENT = (
-    'Mozilla/5.0 (X11; Linux x86_64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/54.0.2840.59 Safari/537.36'
+    "Mozilla/5.0 (X11; Linux x86_64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/54.0.2840.59 Safari/537.36"
 )
 
 # Obey robots.txt rules

@@ -18,6 +18,6 @@ def get_start_url() -> List[str]:
 
 class DocumentationSpider(BaseDocumentationSpider):
     name = "documentation_crawler"
-    deny_domains = ['localhost:9991']
-    deny = [r'\_sources\/.*\.txt']
+    deny_domains = ["localhost:9991"]
+    deny = [r"\_sources\/.*\.txt"]
     start_urls = get_start_url()

@@ -22,8 +22,8 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
         self.images_static_dir: str = get_images_dir(self.images_path)
 
     def _is_external_url(self, url: str) -> bool:
-        is_external = url.startswith('http') and self.start_urls[0] not in url
-        if self._has_extension(url) and f'localhost:9981/{self.images_path}' in url:
+        is_external = url.startswith("http") and self.start_urls[0] not in url
+        if self._has_extension(url) and f"localhost:9981/{self.images_path}" in url:
             self.static_images.add(basename(urlparse(url).path))
         return is_external or self._has_extension(url)
 
@@ -37,20 +37,20 @@ class UnusedImagesLinterSpider(BaseDocumentationSpider):
             unused_images_relatedpath = [
                 os.path.join(self.images_path, img) for img in unused_images
             ]
-            raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
+            raise Exception(exception_message.format(", ".join(unused_images_relatedpath)))
 
 
 class HelpDocumentationSpider(UnusedImagesLinterSpider):
     name = "help_documentation_crawler"
-    start_urls = ['http://localhost:9981/help']
+    start_urls = ["http://localhost:9981/help"]
     deny_domains: List[str] = []
-    deny = ['/privacy']
+    deny = ["/privacy"]
     images_path = "static/images/help"
 
 
 class APIDocumentationSpider(UnusedImagesLinterSpider):
-    name = 'api_documentation_crawler'
-    start_urls = ['http://localhost:9981/api']
+    name = "api_documentation_crawler"
+    start_urls = ["http://localhost:9981/api"]
     deny_domains: List[str] = []
     images_path = "static/images/api"
 
@@ -58,28 +58,28 @@ class APIDocumentationSpider(UnusedImagesLinterSpider):
 class PorticoDocumentationSpider(BaseDocumentationSpider):
     def _is_external_url(self, url: str) -> bool:
         return (
-            not url.startswith('http://localhost:9981')
-            or url.startswith('http://localhost:9981/help')
-            or url.startswith('http://localhost:9981/api')
+            not url.startswith("http://localhost:9981")
+            or url.startswith("http://localhost:9981/help")
+            or url.startswith("http://localhost:9981/api")
             or self._has_extension(url)
         )
 
-    name = 'portico_documentation_crawler'
+    name = "portico_documentation_crawler"
     start_urls = [
-        'http://localhost:9981/hello',
-        'http://localhost:9981/history',
-        'http://localhost:9981/plans',
-        'http://localhost:9981/team',
-        'http://localhost:9981/apps',
-        'http://localhost:9981/integrations',
-        'http://localhost:9981/terms',
-        'http://localhost:9981/privacy',
-        'http://localhost:9981/features',
-        'http://localhost:9981/why-zulip',
-        'http://localhost:9981/for/open-source',
-        'http://localhost:9981/for/companies',
-        'http://localhost:9981/for/working-groups-and-communities',
-        'http://localhost:9981/for/research',
-        'http://localhost:9981/security',
+        "http://localhost:9981/hello",
+        "http://localhost:9981/history",
+        "http://localhost:9981/plans",
+        "http://localhost:9981/team",
+        "http://localhost:9981/apps",
+        "http://localhost:9981/integrations",
+        "http://localhost:9981/terms",
+        "http://localhost:9981/privacy",
+        "http://localhost:9981/features",
+        "http://localhost:9981/why-zulip",
+        "http://localhost:9981/for/open-source",
+        "http://localhost:9981/for/companies",
+        "http://localhost:9981/for/working-groups-and-communities",
+        "http://localhost:9981/for/research",
+        "http://localhost:9981/security",
     ]
     deny_domains: List[str] = []

@@ -13,34 +13,34 @@ from twisted.python.failure import Failure
 
 EXCLUDED_URLS = [
     # Google calendar returns 404s on HEAD requests unconditionally
-    'https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com',
+    "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
     # Returns 409 errors to HEAD requests frequently
-    'https://medium.freecodecamp.org/',
+    "https://medium.freecodecamp.org/",
     # Returns 404 to HEAD requests unconditionally
-    'https://www.git-tower.com/blog/command-line-cheat-sheet/',
-    'https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode',
+    "https://www.git-tower.com/blog/command-line-cheat-sheet/",
+    "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
     # Requires authentication
-    'https://circleci.com/gh/zulip/zulip/tree/master',
-    'https://circleci.com/gh/zulip/zulip/16617',
-    'https://www.linkedin.com/company/zulip-project',
+    "https://circleci.com/gh/zulip/zulip/tree/master",
+    "https://circleci.com/gh/zulip/zulip/16617",
+    "https://www.linkedin.com/company/zulip-project",
     # Returns 403 errors to HEAD requests
-    'https://giphy.com',
-    'https://giphy.com/apps/giphycapture',
-    'https://www.udemy.com/course/the-complete-react-native-and-redux-course/',
+    "https://giphy.com",
+    "https://giphy.com/apps/giphycapture",
+    "https://www.udemy.com/course/the-complete-react-native-and-redux-course/",
 ]
 
 VNU_IGNORE = [
     # Real errors that should be fixed.
-    r'Duplicate ID “[^”]*”\.',
-    r'The first occurrence of ID “[^”]*” was here\.',
-    r'Attribute “markdown” not allowed on element “div” at this point\.',
-    r'No “p” element in scope but a “p” end tag seen\.',
-    r'Element “div” not allowed as child of element “ul” in this context\. '
-    + r'\(Suppressing further errors from this subtree\.\)',
+    r"Duplicate ID “[^”]*”\.",
+    r"The first occurrence of ID “[^”]*” was here\.",
+    r"Attribute “markdown” not allowed on element “div” at this point\.",
+    r"No “p” element in scope but a “p” end tag seen\.",
+    r"Element “div” not allowed as child of element “ul” in this context\. "
+    + r"\(Suppressing further errors from this subtree\.\)",
     # Warnings that are probably less important.
-    r'The “type” attribute is unnecessary for JavaScript resources\.',
+    r"The “type” attribute is unnecessary for JavaScript resources\.",
 ]
-VNU_IGNORE_REGEX = re.compile(r'|'.join(VNU_IGNORE))
+VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))
 
 DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
 
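Worth noting: the curly quotes (“ ”) inside the VNU_IGNORE patterns are distinct code points from the ASCII double quote, so converting the delimiters adds no escapes and Black rewrites these literals like any others. A one-line illustration (the variable name is hypothetical):

    pattern = r'Duplicate ID “[^”]*”\.'   # becomes: r"Duplicate ID “[^”]*”\."
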
@@ -54,15 +54,15 @@ class BaseDocumentationSpider(scrapy.Spider):
     deny_domains: List[str] = []
     start_urls: List[str] = []
     deny: List[str] = []
-    file_extensions: List[str] = ['.' + ext for ext in IGNORED_EXTENSIONS]
-    tags = ('a', 'area', 'img')
-    attrs = ('href', 'src')
+    file_extensions: List[str] = ["." + ext for ext in IGNORED_EXTENSIONS]
+    tags = ("a", "area", "img")
+    attrs = ("href", "src")
 
     def _has_extension(self, url: str) -> bool:
         return url_has_any_extension(url, self.file_extensions)
 
     def _is_external_url(self, url: str) -> bool:
-        return url.startswith('http') or self._has_extension(url)
+        return url.startswith("http") or self._has_extension(url)
 
     def check_existing(self, response: Response) -> None:
         self.log(response)
@@ -85,7 +85,7 @@ class BaseDocumentationSpider(scrapy.Spider):
         ):
             # We can verify these links directly in the local git repo without making any requests to GitHub servers.
             return False
-        if 'github.com/zulip' in url:
+        if "github.com/zulip" in url:
             # We want to check these links but due to rate limiting from GitHub, these checks often
             # fail in the CI. Thus, we should treat these as external links for now.
             # TODO: Figure out how to test github.com/zulip links in CI.
@@ -98,7 +98,7 @@ class BaseDocumentationSpider(scrapy.Spider):
         m = re.match(r".+\#(?P<fragment>.*)$", response.request.url)  # Get fragment value.
         if not m:
             return
-        fragment = m.group('fragment')
+        fragment = m.group("fragment")
         # Check fragment existing on response page.
         if not response.selector.xpath(xpath_template.format(fragment=fragment)):
             self.logger.error(
@@ -108,17 +108,17 @@ class BaseDocumentationSpider(scrapy.Spider):
     def _vnu_callback(self, url: str) -> Callable[[Response], None]:
         def callback(response: Response) -> None:
             vnu_out = json.loads(response.text)
-            for message in vnu_out['messages']:
-                if not VNU_IGNORE_REGEX.fullmatch(message['message']):
+            for message in vnu_out["messages"]:
+                if not VNU_IGNORE_REGEX.fullmatch(message["message"]):
                     self.logger.error(
                         '"%s":%d.%d-%d.%d: %s: %s',
                         url,
-                        message.get('firstLine', message['lastLine']),
-                        message.get('firstColumn', message['lastColumn']),
-                        message['lastLine'],
-                        message['lastColumn'],
-                        message['type'],
-                        message['message'],
+                        message.get("firstLine", message["lastLine"]),
+                        message.get("firstColumn", message["lastColumn"]),
+                        message["lastLine"],
+                        message["lastColumn"],
+                        message["type"],
+                        message["message"],
                     )
 
         return callback
@@ -129,18 +129,18 @@ class BaseDocumentationSpider(scrapy.Spider):
         # crawl documentation served by the webapp (E.g. /help/), we
         # don't want to crawl the webapp itself, so we exclude these.
         if (
-            url in ['http://localhost:9981/', 'http://localhost:9981']
-            or url.startswith('http://localhost:9981/#')
-            or url.startswith('http://localhost:9981#')
+            url in ["http://localhost:9981/", "http://localhost:9981"]
+            or url.startswith("http://localhost:9981/#")
+            or url.startswith("http://localhost:9981#")
         ):
             return
 
         callback: Callable[[Response], Optional[Iterator[Request]]] = self.parse
         dont_filter = False
-        method = 'GET'
+        method = "GET"
         if self._is_external_url(url):
             callback = self.check_existing
-            method = 'HEAD'
+            method = "HEAD"
 
             if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX):
                 file_path = url.replace(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX, DEPLOY_ROOT)
@@ -159,10 +159,10 @@ class BaseDocumentationSpider(scrapy.Spider):
                         "There is no local directory associated with the GitHub URL: %s", url
                     )
                 return
-        elif '#' in url:
+        elif "#" in url:
             dont_filter = True
             callback = self.check_fragment
-        if getattr(self, 'skip_external', False) and self._is_external_link(url):
+        if getattr(self, "skip_external", False) and self._is_external_link(url):
             return
         yield Request(
             url,
@@ -179,11 +179,11 @@ class BaseDocumentationSpider(scrapy.Spider):
     def parse(self, response: Response) -> Iterator[Request]:
         self.log(response)
 
-        if getattr(self, 'validate_html', False):
+        if getattr(self, "validate_html", False):
             yield Request(
-                'http://127.0.0.1:9988/?out=json',
-                method='POST',
-                headers={'Content-Type': response.headers['Content-Type']},
+                "http://127.0.0.1:9988/?out=json",
+                method="POST",
+                headers={"Content-Type": response.headers["Content-Type"]},
                 body=response.body,
                 callback=self._vnu_callback(response.url),
                 errback=self.error_callback,
@@ -191,7 +191,7 @@ class BaseDocumentationSpider(scrapy.Spider):
 
         for link in LxmlLinkExtractor(
             deny_domains=self.deny_domains,
-            deny_extensions=['doc'],
+            deny_extensions=["doc"],
             tags=self.tags,
             attrs=self.attrs,
             deny=self.deny,
@@ -200,7 +200,7 @@ class BaseDocumentationSpider(scrapy.Spider):
             yield from self._make_requests(link.url)
 
     def retry_request_with_get(self, request: Request) -> Iterator[Request]:
-        request.method = 'GET'
+        request.method = "GET"
         request.dont_filter = True
         yield request
 
@@ -212,7 +212,7 @@ class BaseDocumentationSpider(scrapy.Spider):
             response = failure.value.response
             if self.exclude_error(response.url):
                 return None
-            if response.status == 405 and response.request.method == 'HEAD':
+            if response.status == 405 and response.request.method == "HEAD":
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
             self.logger.error("Please check link: %s", response.request.url)
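A sweep like this can be reproduced, or checked for completeness, by running Black in diff mode over the crawler tree; a minimal sketch, where the target path is an assumption inferred from the documentation_crawler module names above rather than stated on this page:

    # verify_black_quotes.py -- hedged sketch; the path below is assumed.
    import subprocess

    # "--diff" prints what Black would rewrite without touching any files;
    # after this commit it should report no remaining quote-only changes here.
    subprocess.run(["black", "--diff", "tools/documentation_crawler"], check=True)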