documentation-crawler: Check images in help documentation.

This checks both that all images under static/images/help/ are used in
the help documentation and that none of the image tags are broken.

- Improve the documentation spiders and crawler with a spider-level error state.

Fixes #3070.
K.Kanakhin
2017-01-10 15:09:24 +06:00
committed by Tim Abbott
parent 9fecd85e4a
commit aaf82ae090
3 changed files with 60 additions and 13 deletions
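
The heart of the commit is a set difference: the new spider records the
basename of every help image it sees while crawling, and on close compares
that set against the files on disk. A minimal sketch of the idea (the
find_unused_images helper is hypothetical, not part of the commit):

    import os
    from posixpath import basename
    from six.moves.urllib.parse import urlparse
    from typing import Set

    def find_unused_images(images_dir, crawled_image_urls):
        # type: (str, Set[str]) -> Set[str]
        # Basenames of every image some crawled page referenced.
        used = set(basename(urlparse(url).path) for url in crawled_image_urls)
        # Whatever is left on disk was never referenced and is unused.
        return set(os.listdir(images_dir)) - used

A non-empty result fails the crawl, which mirrors what
HelpDocumentationSpider.closed() does in the diff below.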


@@ -17,9 +17,7 @@ class StatusCommand(Command):
         self.crawler_process.crawl(crawler)
         self.crawler_process.start()
         # Get exceptions quantity from crawler stat data
-        stats = crawler.stats.get_stats()
-        error_404 = 'downloader/response_status_count/404'
-        error_io = 'downloader/exception_type_count/exceptions.IOError'
-        if stats.get(error_404) or stats.get(error_io):
+        if crawler.spider.has_error:
             # Return non-zero exit code if exceptions are contained
             self.exitcode = 1
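
The contract this hunk establishes: the spider flags failures itself
instead of the command grepping crawler stats afterwards. A sketch of the
pattern under that assumption (the runner function is illustrative, not
the actual StatusCommand):

    from scrapy.crawler import CrawlerProcess

    def run_spider_and_get_exitcode(spider_cls):
        # type: (type) -> int
        # The spider flips has_error via _set_error_state() whenever it
        # hits a 404, an IOError, or a missing permalink; the caller only
        # inspects that one flag once the crawl finishes.
        process = CrawlerProcess()
        crawler = process.create_crawler(spider_cls)
        process.crawl(crawler)
        process.start()
        return 1 if crawler.spider.has_error else 0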


@@ -1,16 +1,51 @@
 #!/usr/bin/env python
 from __future__ import print_function
+import os
+from posixpath import basename
+from six.moves.urllib.parse import urlparse

 from .common.spiders import BaseDocumentationSpider

+from typing import Any, List, Set
+
+
+def get_help_images_dir(help_images_path):
+    # type: (str) -> str
+    # Get index html file as start url and convert it to file uri
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    target_path = os.path.join(dir_path, os.path.join(*[os.pardir] * 4), help_images_path)
+    return os.path.realpath(target_path)
+
+
 class HelpDocumentationSpider(BaseDocumentationSpider):
     name = "help_documentation_crawler"
     start_urls = ['http://localhost:9981/help']
     deny_domains = []  # type: List[str]
     deny = ['/privacy']
+    help_images_path = "static/images/help"
+    help_images_static_dir = get_help_images_dir(help_images_path)
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(HelpDocumentationSpider, self).__init__(*args, **kwargs)
+        self.static_images = set()  # type: Set
+
+    def _is_external_url(self, url):
+        # type: (str) -> bool
+        is_external = url.startswith('http') and 'localhost:9981/help' not in url
+        if self._has_extension(url) and 'localhost:9981/static/images/help' in url:
+            self.static_images.add(basename(urlparse(url).path))
+        return is_external or self._has_extension(url)
+
+    def closed(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        unused_images = set(os.listdir(self.help_images_static_dir)) - self.static_images
+        if unused_images:
+            exception_message = "The following images are not used in help documentation " \
+                                "and can be removed: {}"
+            self._set_error_state()
+            unused_images_relatedpath = [
+                os.path.join(self.help_images_path, img) for img in unused_images]
+            raise Exception(exception_message.format(', '.join(unused_images_relatedpath)))
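
The overridden _is_external_url does double duty: it keeps the crawl on
help pages while routing anything with a file extension, help images
included, through the existence check, recording image basenames as a side
effect. Expected behaviour, written as a hypothetical interpreter session
(example.png is an invented filename):

    spider = HelpDocumentationSpider()
    spider._is_external_url('http://localhost:9981/help/getting-started')
    # False -> crawled and parsed as a normal help page
    spider._is_external_url('http://localhost:9981/static/images/help/example.png')
    # True -> HEAD-checked for existence; basename recorded along the way
    spider.static_images
    # {'example.png'}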


@@ -14,12 +14,23 @@ from typing import Any, Generator, List, Optional, Tuple
 class BaseDocumentationSpider(scrapy.Spider):
     name = None  # type: Optional[str]
     # Exclude domain address.
     deny_domains = []  # type: List[str]
     start_urls = []  # type: List[str]
     deny = ()  # type: Tuple
     file_extensions = ['.' + ext for ext in IGNORED_EXTENSIONS]  # type: List[str]
+    tags = ('a', 'area', 'img')
+    attrs = ('href', 'src')
+
+    def __init__(self, *args, **kwargs):
+        # type: (*Any, **Any) -> None
+        super(BaseDocumentationSpider, self).__init__(*args, **kwargs)
+        self.has_error = False
+
+    def _set_error_state(self):
+        # type: () -> None
+        self.has_error = True

     def _has_extension(self, url):
         # type: (str) -> bool
@@ -43,19 +54,19 @@ class BaseDocumentationSpider(scrapy.Spider):
         permalink = m.group('permalink')
         # Check permalink existing on response page.
         if not response.selector.xpath(xpath_template.format(permalink=permalink)):
+            self._set_error_state()
             raise Exception(
                 "Permalink #{} is not found on page {}".format(permalink, response.request.url))

     def parse(self, response):
         # type: (Any) -> Generator[Request, None, None]
         self.log(response)
-        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=[],
-                                      deny=self.deny,
+        for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
+                                      tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
             callback = self.parse  # type: Any
             dont_filter = False
             method = 'GET'
             if self._is_external_url(link.url):
                 callback = self.check_existing
                 method = 'HEAD'
@@ -76,10 +87,13 @@ class BaseDocumentationSpider(scrapy.Spider):
         if hasattr(failure.value, 'response') and failure.value.response:
             response = failure.value.response
             if response.status == 404:
+                self._set_error_state()
                 raise Exception('Page not found: {}'.format(response))
             if response.status == 405 and response.request.method == 'HEAD':
                 # Method 'HEAD' not allowed, repeat request with 'GET'
                 return self.retry_request_with_get(response.request)
             self.log("Error! Please check link: {}".format(response), logging.ERROR)
+        elif isinstance(failure.type, IOError):
+            self._set_error_state()
         else:
             raise Exception(failure.value)
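
Why the extractor change matters: with scrapy's default tags ('a', 'area')
and attrs ('href',), img tags are invisible to the crawler. Adding
'img'/'src', and relaxing deny_extensions (which by default filters out
image extensions), makes the crawler request each image, so a broken tag
surfaces as a 404. A small self-contained sketch (the page markup and
image name are invented):

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LxmlLinkExtractor

    html = ('<html><body><a href="/help/edit-message">edit</a>'
            '<img src="/static/images/help/example.png"></body></html>')
    response = HtmlResponse('http://localhost:9981/help',
                            body=html, encoding='utf-8')

    # Default extractor: only follows a/area href attributes.
    print([link.url for link in
           LxmlLinkExtractor(canonicalize=False).extract_links(response)])
    # ['http://localhost:9981/help/edit-message']

    # Extractor as configured above: img src is extracted too, and
    # deny_extensions=['doc'] keeps .png URLs from being filtered out.
    extractor = LxmlLinkExtractor(tags=('a', 'area', 'img'),
                                  attrs=('href', 'src'),
                                  deny_extensions=['doc'],
                                  canonicalize=False)
    print([link.url for link in extractor.extract_links(response)])
    # ['http://localhost:9981/help/edit-message',
    #  'http://localhost:9981/static/images/help/example.png']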