documentation-crawler: Check images in help documentation.

This checks both that all images under static/images/help/ are used in
the help documentation and that none of the image tags are broken.

- Improve the documentation spiders and crawler with a spider-level error state.

Fixes #3070.
K.Kanakhin
2017-01-10 15:09:24 +06:00
committed by Tim Abbott
parent 9fecd85e4a
commit aaf82ae090
3 changed files with 60 additions and 13 deletions
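
The heart of the commit is a set difference: the new spider records the
basename of every help image it sees while crawling, and on close compares
that set against the files on disk. A minimal sketch of the idea (the
find_unused_images helper is hypothetical, not part of the commit):

    import os
    from posixpath import basename
    from six.moves.urllib.parse import urlparse
    from typing import Set

    def find_unused_images(images_dir, crawled_image_urls):
        # type: (str, Set[str]) -> Set[str]
        # Basenames of every image some crawled page referenced.
        used = set(basename(urlparse(url).path) for url in crawled_image_urls)
        # Whatever is left on disk was never referenced and is unused.
        return set(os.listdir(images_dir)) - used

A non-empty result fails the crawl, which mirrors what
HelpDocumentationSpider.closed() does in the diff below.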


@@ -17,9 +17,7 @@ class StatusCommand(Command):
         self.crawler_process.crawl(crawler)
         self.crawler_process.start()
         # Get exceptions quantity from crawler stat data
-        stats = crawler.stats.get_stats()
-        error_404 = 'downloader/response_status_count/404'
-        error_io = 'downloader/exception_type_count/exceptions.IOError'
-        if stats.get(error_404) or stats.get(error_io):
+        if crawler.spider.has_error:
             # Return non-zero exit code if exceptions are contained
             self.exitcode = 1
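
The contract this hunk establishes: the spider flags failures itself
instead of the command grepping crawler stats afterwards. A sketch of the
pattern under that assumption (the runner function is illustrative, not
the actual StatusCommand):

    from scrapy.crawler import CrawlerProcess

    def run_spider_and_get_exitcode(spider_cls):
        # type: (type) -> int
        # The spider flips has_error via _set_error_state() whenever it
        # hits a 404, an IOError, or a missing permalink; the caller only
        # inspects that one flag once the crawl finishes.
        process = CrawlerProcess()
        crawler = process.create_crawler(spider_cls)
        process.crawl(crawler)
        process.start()
        return 1 if crawler.spider.has_error else 0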


@@ -1,16 +1,51 @@
 #!/usr/bin/env python
 from __future__ import print_function
+import os
+from posixpath import basename
+from six.moves.urllib.parse import urlparse

 from .common.spiders import BaseDocumentationSpider

+from typing import Any, List, Set
+
+
+def get_help_images_dir(help_images_path):
+    # type: (str) -> str
+    # Get index html file as start url and convert it to file uri
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    target_path = os.path.join(dir_path, os.path.join(*[os.pardir] * 4), help_images_path)
+    return os.path.realpath(target_path)
+
+
 class HelpDocumentationSpider(BaseDocumentationSpider):
     name = "help_documentation_crawler"
     start_urls = ['http://localhost:9981/help']
     deny_domains = []  # type: List[str]
     deny = ['/privacy']
+    help_images_path = "static/images/help"
+    help_images_static_dir = get_help_images_dir(help_images_path)
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(HelpDocumentationSpider, self).__init__(*args, **kwargs)
+        self.static_images = set()  # type: Set
+
+    def _is_external_url(self, url):
+        # type: (str) -> bool
+        is_external = url.startswith('http') and 'localhost:9981/help' not in url
+        if self._has_extension(url) and 'localhost:9981/static/images/help' in url:
+            self.static_images.add(basename(urlparse(url).path))
+        return is_external or self._has_extension(url)
+
+    def closed(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        unused_images = set(os.listdir(self.help_images_static_dir)) - self.static_images
+        if unused_images:
+            exception_message = "The following images are not used in help documentation " \
+                                "and can be removed: {}"
+            self._set_error_state()
+            unused_images_relatedpath = [
+                os.path.join(self.help_images_path, img) for img in unused_images]
+            raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
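
The overridden _is_external_url does double duty: it keeps the crawl on
help pages while routing anything with a file extension, help images
included, through the existence check, recording image basenames as a side
effect. Expected behaviour, written as a hypothetical interpreter session
(example.png is an invented filename):

    spider = HelpDocumentationSpider()
    spider._is_external_url('http://localhost:9981/help/getting-started')
    # False -> crawled and parsed as a normal help page
    spider._is_external_url('http://localhost:9981/static/images/help/example.png')
    # True -> HEAD-checked for existence; basename recorded along the way
    spider.static_images
    # {'example.png'}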


@@ -14,12 +14,23 @@ from typing import Any, Generator, List, Optional, Tuple
 class BaseDocumentationSpider(scrapy.Spider):
     name = None  # type: Optional[str]
     # Exclude domain address.
     deny_domains = []  # type: List[str]
     start_urls = []  # type: List[str]
     deny = ()  # type: Tuple
     file_extensions = ['.' + ext for ext in IGNORED_EXTENSIONS]  # type: List[str]
+    tags = ('a', 'area', 'img')
+    attrs = ('href', 'src')
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(BaseDocumentationSpider, self).__init__(*args, **kwargs)
+        self.has_error = False
+
+    def _set_error_state(self):
+        # type: () -> None
+        self.has_error = True

     def _has_extension(self, url):
         # type: (str) -> bool
@@ -43,19 +54,19 @@ class BaseDocumentationSpider(scrapy.Spider):
         permalink = m.group('permalink')
         # Check permalink existing on response page.
         if not response.selector.xpath(xpath_template.format(permalink=permalink)):
+            self._set_error_state()
             raise Exception(
                 "Permalink #{} is not found on page {}".format(permalink, response.request.url))

     def parse(self, response):
         # type: (Any) -> Generator[Request, None, None]
         self.log(response)
-        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=[],
-                                      deny=self.deny,
+        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
+                                      tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
             callback = self.parse  # type: Any
             dont_filter = False
             method = 'GET'
             if self._is_external_url(link.url):
                 callback = self.check_existing
                 method = 'HEAD'
@@ -76,10 +87,13 @@ class BaseDocumentationSpider(scrapy.Spider):
         if hasattr(failure.value, 'response') and failure.value.response:
             response = failure.value.response
             if response.status == 404:
+                self._set_error_state()
                 raise Exception('Page not found: {}'.format(response))
             if response.status == 405 and response.request.method == 'HEAD':
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
             self.log("Error! Please check link: {}".format(response), logging.ERROR)
+        elif isinstance(failure.type, IOError):
+            self._set_error_state()
         else:
             raise Exception(failure.value)
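
Why the extractor change matters: with scrapy's default tags ('a', 'area')
and attrs ('href',), img tags are invisible to the crawler. Adding
'img'/'src', and relaxing deny_extensions (which by default filters out
image extensions), makes the crawler request each image, so a broken tag
surfaces as a 404. A small self-contained sketch (the page markup and
image name are invented):

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LxmlLinkExtractor

    html = ('<html><body><a href="/help/edit-message">edit</a>'
            '<img src="/static/images/help/example.png"></body></html>')
    response = HtmlResponse('http://localhost:9981/help',
                            body=html, encoding='utf-8')

    # Default extractor: only follows a/area href attributes.
    print([link.url for link in
           LxmlLinkExtractor(canonicalize=False).extract_links(response)])
    # ['http://localhost:9981/help/edit-message']

    # Extractor as configured above: img src is extracted too, and
    # deny_extensions=['doc'] keeps .png URLs from being filtered out.
    extractor = LxmlLinkExtractor(tags=('a', 'area', 'img'),
                                  attrs=('href', 'src'),
                                  deny_extensions=['doc'],
                                  canonicalize=False)
    print([link.url for link in extractor.extract_links(response)])
    # ['http://localhost:9981/help/edit-message',
    #  'http://localhost:9981/static/images/help/example.png']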