documentation-crawler: Check images in help documentation.

This checks both that all images under static/images/help/ are used in
the help documentation and that none of the image tags are broken.

- Improve documentation spiders and crawler with a spider error state.

Fixes #3070.
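At its core, the unused-image check is a set difference: the basenames of every image URL the crawler requests under static/images/help versus the files that actually sit in that directory. A standalone sketch of the same idea follows; the helper name, the directory argument, and the sample URL are illustrative, while the imports mirror the ones added in this commit.

    import os
    from posixpath import basename
    from six.moves.urllib.parse import urlparse

    def find_unused_images(images_dir, seen_image_urls):
        # type: (str, list) -> list
        # Basenames of every help image the crawler actually requested.
        seen = set(basename(urlparse(url).path) for url in seen_image_urls)
        # Files on disk that no help page references are candidates for removal.
        return sorted(set(os.listdir(images_dir)) - seen)

    # Illustrative call; the spider below gathers these URLs while crawling /help.
    print(find_unused_images('static/images/help',
                             ['http://localhost:9981/static/images/help/edit-message.png']))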
@@ -17,9 +17,7 @@ class StatusCommand(Command):
         self.crawler_process.crawl(crawler)
         self.crawler_process.start()
         # Get exceptions quantity from crawler stat data
-        stats = crawler.stats.get_stats()
-        error_404 = 'downloader/response_status_count/404'
-        error_io = 'downloader/exception_type_count/exceptions.IOError'
-        if stats.get(error_404) or stats.get(error_io):
+        if crawler.spider.has_error:
             # Return non-zero exit code if exceptions are contained
             self.exitcode = 1
@@ -1,16 +1,51 @@
 #!/usr/bin/env python
 from __future__ import print_function
 
+import os
+
+from posixpath import basename
+from six.moves.urllib.parse import urlparse
+
 from .common.spiders import BaseDocumentationSpider
 
+from typing import Any, List, Set
+
+
+def get_help_images_dir(help_images_path):
+    # type: (str) -> str
+    # Get index html file as start url and convert it to file uri
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    target_path = os.path.join(dir_path, os.path.join(*[os.pardir] * 4), help_images_path)
+    return os.path.realpath(target_path)
 
 
 class HelpDocumentationSpider(BaseDocumentationSpider):
     name = "help_documentation_crawler"
     start_urls = ['http://localhost:9981/help']
     deny_domains = []  # type: List[str]
     deny = ['/privacy']
+    help_images_path = "static/images/help"
+    help_images_static_dir = get_help_images_dir(help_images_path)
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(HelpDocumentationSpider, self).__init__(*args, **kwargs)
+        self.static_images = set()  # type: Set
+
     def _is_external_url(self, url):
         # type: (str) -> bool
         is_external = url.startswith('http') and 'localhost:9981/help' not in url
+        if self._has_extension(url) and 'localhost:9981/static/images/help' in url:
+            self.static_images.add(basename(urlparse(url).path))
         return is_external or self._has_extension(url)
+
+    def closed(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        unused_images = set(os.listdir(self.help_images_static_dir)) - self.static_images
+        if unused_images:
+            exception_message = "The following images are not used in help documentation " \
+                                "and can be removed: {}"
+            self._set_error_state()
+            unused_images_relatedpath = [
+                os.path.join(self.help_images_path, img) for img in unused_images]
+            raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
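In get_help_images_dir above, os.path.join(*[os.pardir] * 4) climbs four directory levels from the spiders package back to the repository root before appending the static/images/help path. A rough equivalent of that path arithmetic, using an assumed checkout location purely for illustration:

    import os

    # Assumed location of the spiders package inside a checkout (illustrative).
    spiders_dir = '/srv/zulip/tools/documentation_crawler/documentation_crawler/spiders'

    # Four os.pardir components walk back up to the repository root.
    repo_root = os.path.realpath(os.path.join(spiders_dir, *[os.pardir] * 4))
    print(repo_root)                                      # /srv/zulip
    print(os.path.join(repo_root, 'static/images/help'))  # /srv/zulip/static/images/help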
@@ -20,6 +20,17 @@ class BaseDocumentationSpider(scrapy.Spider):
     start_urls = []  # type: List[str]
     deny = ()  # type: Tuple
     file_extensions = ['.' + ext for ext in IGNORED_EXTENSIONS]  # type: List[str]
+    tags = ('a', 'area', 'img')
+    attrs = ('href', 'src')
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(BaseDocumentationSpider, self).__init__(*args, **kwargs)
+        self.has_error = False
+
+    def _set_error_state(self):
+        # type: () -> None
+        self.has_error = True
 
     def _has_extension(self, url):
         # type: (str) -> bool
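The new has_error attribute is the "spider error state" from the commit message: every check that finds a problem calls _set_error_state(), and crawl_with_status (first hunk) turns the flag into a non-zero exit code once the crawl finishes. A minimal illustration of the flag on its own, with an arbitrary spider name and the import path assumed from the relative import above:

    from documentation_crawler.spiders.common.spiders import BaseDocumentationSpider

    spider = BaseDocumentationSpider(name='example_crawler')
    assert spider.has_error is False

    # Called from the 404/IOError handlers, the permalink check and the image check.
    spider._set_error_state()
    assert spider.has_error is True
    # crawl_with_status then sets self.exitcode = 1 because crawler.spider.has_error is True.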
@@ -43,19 +54,19 @@ class BaseDocumentationSpider(scrapy.Spider):
         permalink = m.group('permalink')
         # Check permalink existing on response page.
         if not response.selector.xpath(xpath_template.format(permalink=permalink)):
+            self._set_error_state()
             raise Exception(
                 "Permalink #{} is not found on page {}".format(permalink, response.request.url))
 
     def parse(self, response):
         # type: (Any) -> Generator[Request, None, None]
         self.log(response)
-        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=[],
-                                      deny=self.deny,
+        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
+                                      tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
             callback = self.parse  # type: Any
             dont_filter = False
             method = 'GET'
-
             if self._is_external_url(link.url):
                 callback = self.check_existing
                 method = 'HEAD'
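Passing tags=('a', 'area', 'img') and attrs=('href', 'src') is what makes the extractor above return <img> sources as links to crawl, so broken help images surface as 404s and their basenames get recorded by HelpDocumentationSpider. A self-contained sketch of that extractor behaviour; the HTML snippet and URLs are made up for illustration:

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LxmlLinkExtractor

    html = (b'<a href="/help/edit-or-delete-a-message">Edit</a>'
            b'<img src="/static/images/help/edit-message.png">')
    response = HtmlResponse(url='http://localhost:9981/help', body=html, encoding='utf-8')

    # An empty deny_extensions keeps image URLs, which the default filter would drop.
    extractor = LxmlLinkExtractor(tags=('a', 'area', 'img'), attrs=('href', 'src'),
                                  deny_extensions=[])
    print([link.url for link in extractor.extract_links(response)])
    # ['http://localhost:9981/help/edit-or-delete-a-message',
    #  'http://localhost:9981/static/images/help/edit-message.png']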
@@ -76,10 +87,13 @@ class BaseDocumentationSpider(scrapy.Spider):
         if hasattr(failure.value, 'response') and failure.value.response:
             response = failure.value.response
             if response.status == 404:
+                self._set_error_state()
                 raise Exception('Page not found: {}'.format(response))
             if response.status == 405 and response.request.method == 'HEAD':
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
             self.log("Error! Please check link: {}".format(response), logging.ERROR)
+        elif isinstance(failure.type, IOError):
+            self._set_error_state()
         else:
             raise Exception(failure.value)
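retry_request_with_get is not shown in this diff; going by how it is used above, a plausible implementation simply re-issues the failed HEAD request as a GET (a sketch, not the project's actual helper):

    def retry_request_with_get(self, request):
        # type: (Request) -> Iterator[Request]
        # Repeat the request with GET and skip the duplicate filter, since the
        # scheduler has already seen this URL once as a HEAD request.
        yield request.replace(method='GET', dont_filter=True)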