documentation-crawler: Check images in help documentation.

This checks both that all images under static/images/help/ are used in
the help documentation and that none of the image tags are broken.

- Improve documentation spiders and crawler with a spider error state.

Fixes #3070.
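At its core, the unused-image check is a set difference: the basenames of every image URL the crawler requests under static/images/help versus the files that actually sit in that directory. A standalone sketch of the same idea follows; the helper name, the directory argument, and the sample URL are illustrative, while the imports mirror the ones added in this commit.

    import os
    from posixpath import basename
    from six.moves.urllib.parse import urlparse

    def find_unused_images(images_dir, seen_image_urls):
        # type: (str, list) -> list
        # Basenames of every help image the crawler actually requested.
        seen = set(basename(urlparse(url).path) for url in seen_image_urls)
        # Files on disk that no help page references are candidates for removal.
        return sorted(set(os.listdir(images_dir)) - seen)

    # Illustrative call; the spider below gathers these URLs while crawling /help.
    print(find_unused_images('static/images/help',
                             ['http://localhost:9981/static/images/help/edit-message.png']))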
@@ -17,9 +17,7 @@ class StatusCommand(Command):
         self.crawler_process.crawl(crawler)
         self.crawler_process.start()
         # Get exceptions quantity from crawler stat data
-        stats = crawler.stats.get_stats()
-        error_404 = 'downloader/response_status_count/404'
-        error_io = 'downloader/exception_type_count/exceptions.IOError'
-        if stats.get(error_404) or stats.get(error_io):
+        if crawler.spider.has_error:
             # Return non-zero exit code if exceptions are contained
             self.exitcode = 1
@@ -1,16 +1,51 @@
 #!/usr/bin/env python
 from __future__ import print_function
 
+import os
+
+from posixpath import basename
+from six.moves.urllib.parse import urlparse
+
 from .common.spiders import BaseDocumentationSpider
 
+from typing import Any, List, Set
+
+
+def get_help_images_dir(help_images_path):
+    # type: (str) -> str
+    # Get index html file as start url and convert it to file uri
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    target_path = os.path.join(dir_path, os.path.join(*[os.pardir] * 4), help_images_path)
+    return os.path.realpath(target_path)
 
 
 class HelpDocumentationSpider(BaseDocumentationSpider):
     name = "help_documentation_crawler"
     start_urls = ['http://localhost:9981/help']
     deny_domains = []  # type: List[str]
     deny = ['/privacy']
+    help_images_path = "static/images/help"
+    help_images_static_dir = get_help_images_dir(help_images_path)
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(HelpDocumentationSpider, self).__init__(*args, **kwargs)
+        self.static_images = set()  # type: Set
+
     def _is_external_url(self, url):
         # type: (str) -> bool
         is_external = url.startswith('http') and 'localhost:9981/help' not in url
+        if self._has_extension(url) and 'localhost:9981/static/images/help' in url:
+            self.static_images.add(basename(urlparse(url).path))
         return is_external or self._has_extension(url)
+
+    def closed(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        unused_images = set(os.listdir(self.help_images_static_dir)) - self.static_images
+        if unused_images:
+            exception_message = "The following images are not used in help documentation " \
+                                "and can be removed: {}"
+            self._set_error_state()
+            unused_images_relatedpath = [
+                os.path.join(self.help_images_path, img) for img in unused_images]
+            raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
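In get_help_images_dir above, os.path.join(*[os.pardir] * 4) climbs four directory levels from the spiders package back to the repository root before appending the static/images/help path. A rough equivalent of that path arithmetic, using an assumed checkout location purely for illustration:

    import os

    # Assumed location of the spiders package inside a checkout (illustrative).
    spiders_dir = '/srv/zulip/tools/documentation_crawler/documentation_crawler/spiders'

    # Four os.pardir components walk back up to the repository root.
    repo_root = os.path.realpath(os.path.join(spiders_dir, *[os.pardir] * 4))
    print(repo_root)                                      # /srv/zulip
    print(os.path.join(repo_root, 'static/images/help'))  # /srv/zulip/static/images/help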
@@ -20,6 +20,17 @@ class BaseDocumentationSpider(scrapy.Spider):
     start_urls = []  # type: List[str]
     deny = ()  # type: Tuple
     file_extensions = ['.' + ext for ext in IGNORED_EXTENSIONS]  # type: List[str]
+    tags = ('a', 'area', 'img')
+    attrs = ('href', 'src')
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(BaseDocumentationSpider, self).__init__(*args, **kwargs)
+        self.has_error = False
+
+    def _set_error_state(self):
+        # type: () -> None
+        self.has_error = True
 
     def _has_extension(self, url):
         # type: (str) -> bool
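The new has_error attribute is the "spider error state" from the commit message: every check that finds a problem calls _set_error_state(), and crawl_with_status (first hunk) turns the flag into a non-zero exit code once the crawl finishes. A minimal illustration of the flag on its own, with an arbitrary spider name and the import path assumed from the relative import above:

    from documentation_crawler.spiders.common.spiders import BaseDocumentationSpider

    spider = BaseDocumentationSpider(name='example_crawler')
    assert spider.has_error is False

    # Called from the 404/IOError handlers, the permalink check and the image check.
    spider._set_error_state()
    assert spider.has_error is True
    # crawl_with_status then sets self.exitcode = 1 because crawler.spider.has_error is True.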
@@ -43,19 +54,19 @@ class BaseDocumentationSpider(scrapy.Spider):
         permalink = m.group('permalink')
         # Check permalink existing on response page.
         if not response.selector.xpath(xpath_template.format(permalink=permalink)):
+            self._set_error_state()
             raise Exception(
                 "Permalink #{} is not found on page {}".format(permalink, response.request.url))
 
     def parse(self, response):
         # type: (Any) -> Generator[Request, None, None]
         self.log(response)
-        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=[],
-                                      deny=self.deny,
+        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
+                                      tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
             callback = self.parse  # type: Any
             dont_filter = False
             method = 'GET'
-
             if self._is_external_url(link.url):
                 callback = self.check_existing
                 method = 'HEAD'
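Passing tags=('a', 'area', 'img') and attrs=('href', 'src') is what makes the extractor above return <img> sources as links to crawl, so broken help images surface as 404s and their basenames get recorded by HelpDocumentationSpider. A self-contained sketch of that extractor behaviour; the HTML snippet and URLs are made up for illustration:

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LxmlLinkExtractor

    html = (b'<a href="/help/edit-or-delete-a-message">Edit</a>'
            b'<img src="/static/images/help/edit-message.png">')
    response = HtmlResponse(url='http://localhost:9981/help', body=html, encoding='utf-8')

    # An empty deny_extensions keeps image URLs, which the default filter would drop.
    extractor = LxmlLinkExtractor(tags=('a', 'area', 'img'), attrs=('href', 'src'),
                                  deny_extensions=[])
    print([link.url for link in extractor.extract_links(response)])
    # ['http://localhost:9981/help/edit-or-delete-a-message',
    #  'http://localhost:9981/static/images/help/edit-message.png']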
@@ -76,10 +87,13 @@ class BaseDocumentationSpider(scrapy.Spider):
         if hasattr(failure.value, 'response') and failure.value.response:
             response = failure.value.response
             if response.status == 404:
+                self._set_error_state()
                 raise Exception('Page not found: {}'.format(response))
             if response.status == 405 and response.request.method == 'HEAD':
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
             self.log("Error! Please check link: {}".format(response), logging.ERROR)
+        elif isinstance(failure.type, IOError):
+            self._set_error_state()
         else:
             raise Exception(failure.value)
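retry_request_with_get is not shown in this diff; going by how it is used above, a plausible implementation simply re-issues the failed HEAD request as a GET (a sketch, not the project's actual helper):

    def retry_request_with_get(self, request):
        # type: (Request) -> Iterator[Request]
        # Repeat the request with GET and skip the duplicate filter, since the
        # scheduler has already seen this URL once as a HEAD request.
        yield request.replace(method='GET', dont_filter=True)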