zulip/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py

import json
import os
import re
from collections.abc import Callable, Iterator
from urllib.parse import urlsplit

import scrapy
from scrapy.http import Request, Response
from scrapy.linkextractors import IGNORED_EXTENSIONS
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.url import url_has_any_extension
from twisted.python.failure import Failure

EXCLUDED_DOMAINS = [
    # Returns 429 rate-limiting errors
    "github.com",
    "gist.github.com",
    # Returns 503 errors
    "www.amazon.com",
    "gitlab.com",
]

EXCLUDED_URLS = [
    # Google Calendar returns 404s on HEAD requests unconditionally
    "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
    # Returns 409 errors to HEAD requests frequently
    "https://medium.freecodecamp.org/",
    # Returns 404 to HEAD requests unconditionally
    "https://www.git-tower.com/blog/command-line-cheat-sheet/",
    "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
    "https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh",
    # Requires authentication
    "https://www.linkedin.com/company/zulip-project",
    # Returns 403 errors to HEAD requests
    "https://giphy.com",
    "https://giphy.com/apps/giphycapture",
    "https://www.udemy.com/course/the-complete-react-native-and-redux-course/",
]

VNU_IGNORE = [
    # Real errors that should be fixed.
    r"Attribute “markdown” not allowed on element “div” at this point\.",
    r"No “p” element in scope but a “p” end tag seen\.",
    (
        r"Element “div” not allowed as child of element “ul” in this context\."
        r" \(Suppressing further errors from this subtree\.\)"
    ),
    # Opinionated informational messages.
    r"Trailing slash on void elements has no effect and interacts badly with unquoted attribute values\.",
]
VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))
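
# Six levels up from this file: the root of the Zulip checkout.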
DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"
ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX = "/zulip/zulip/tree/main"


class BaseDocumentationSpider(scrapy.Spider):
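    """Base class for the documentation link-checking spiders.

    The spider crawls outward from start_urls, following links found in
    each page, and logs errors for broken links, missing #fragment
    anchors, and (when validate_html is set) HTML validation problems.
    """
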
    name: str | None = None
    # Domains whose links should not be crawled.
    deny_domains: list[str] = []
    start_urls: list[str] = []
    deny: list[str] = []
    file_extensions: list[str] = ["." + ext for ext in IGNORED_EXTENSIONS]
    tags = ("a", "area", "img")
    attrs = ("href", "src")

    def _has_extension(self, url: str) -> bool:
        return url_has_any_extension(url, self.file_extensions)

    def _is_external_url(self, url: str) -> bool:
        return url.startswith("http") or self._has_extension(url)

    def check_existing(self, response: Response) -> None:
        self.log(response)

    def _is_external_link(self, url: str) -> bool:
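        """Decide whether a URL should be treated as an external link,
        which the crawl skips entirely when skip_external is enabled.
        """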
        split_url = urlsplit(url)
        if split_url.hostname in ("chat.zulip.org", "status.zulip.com"):
            # Since most chat.zulip.org URLs will be links to specific
            # logged-in content that the spider cannot verify, or the
            # homepage, there's no need to check those (which can
            # cause errors when chat.zulip.org is being updated).
            #
            # status.zulip.com is externally hosted and, in a peculiar twist of
            # cosmic irony, often itself offline.
            return True
        if split_url.hostname == "zulip.readthedocs.io" or f".{split_url.hostname}".endswith(
            (".zulip.com", ".zulip.org")
        ):
            # We want CI to check any links to Zulip sites.
            return False
        if split_url.scheme == "file" or split_url.hostname == "localhost":
            # We also want CI to check any links to built documentation.
            return False
        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
            (
                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/",
                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/",
            )
        ):
            # We can verify these links directly in the local Git repo,
            # without making any requests to GitHub servers.
            return False
        if split_url.hostname == "github.com" and split_url.path.startswith("/zulip/"):
            # We want to check these links, but due to rate limiting from
            # GitHub, these checks often fail in CI. Thus, we should treat
            # these as external links for now.
            # TODO: Figure out how to test github.com/zulip links in CI.
            return True
        return True

    def check_fragment(self, response: Response) -> None:
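        """Check that the #fragment of the request URL is present in the
        fetched page as an id or name attribute, logging an error if not.
        """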
        self.log(response)
        xpath_template = "//*[@id='{fragment}' or @name='{fragment}']"
        fragment = urlsplit(response.request.url).fragment
        # Check that the fragment exists on the response page.
        if not response.selector.xpath(xpath_template.format(fragment=fragment)):
            self.logger.error(
                "Fragment #%s is not found on page %s", fragment, response.request.url
            )

    def _vnu_callback(self, url: str) -> Callable[[Response], None]:
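        """Return a callback that logs every message reported by the vnu
        HTML validator for the page at `url`, except those matching
        VNU_IGNORE_REGEX.
        """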
        def callback(response: Response) -> None:
            vnu_out = json.loads(response.text)
            for message in vnu_out["messages"]:
                if not VNU_IGNORE_REGEX.fullmatch(message["message"]):
                    self.logger.error(
                        '"%s":%d.%d-%d.%d: %s: %s',
                        url,
                        message.get("firstLine", message["lastLine"]),
                        message.get("firstColumn", message["lastColumn"]),
                        message["lastLine"],
                        message["lastColumn"],
                        message["type"],
                        message["message"],
                    )

        return callback

    def _make_requests(self, url: str) -> Iterator[Request]:
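        """Yield at most one Request for this URL, choosing method and
        callback based on whether it is internal documentation, an
        external link, a GitHub link verifiable from the local repo,
        or excluded entirely.
        """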
        # These URLs are for Zulip's web app, which with recent changes
        # can be accessible without logging into an account. While we
        # do crawl documentation served by the web app (e.g. /help/),
        # we don't want to crawl the web app itself, so we exclude
        # these.
        split_url = urlsplit(url)
        if split_url.netloc == "localhost:9981" and split_url.path in ["", "/"]:
            return

        # These pages have anchor links, like #all, that are invisible
        # to the user, and would otherwise fail this test.
        if url.startswith("http://localhost:9981/communities"):
            return
        if url.startswith("http://localhost:9981/plans"):
            return

        callback: Callable[[Response], Iterator[Request] | None] = self.parse
        dont_filter = False
        method = "GET"
        if self._is_external_url(url):
            callback = self.check_existing
            method = "HEAD"

            if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/"
            ):
                file_path = (
                    DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX) :]
                )
                if not os.path.isfile(file_path):
                    self.logger.error(
                        "There is no local file associated with the GitHub URL: %s", url
                    )
                return
            elif split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/"
            ):
                dir_path = (
                    DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX) :]
                )
                if not os.path.isdir(dir_path):
                    self.logger.error(
                        "There is no local directory associated with the GitHub URL: %s", url
                    )
                return
        elif split_url.fragment != "":
            dont_filter = True
            callback = self.check_fragment

        if getattr(self, "skip_external", False) and self._is_external_link(url):
            return
        if split_url.hostname in EXCLUDED_DOMAINS:
            return
        if url in EXCLUDED_URLS:
            return

        yield Request(
            url,
            method=method,
            callback=callback,
            dont_filter=dont_filter,
            errback=self.error_callback,
        )

    def start_requests(self) -> Iterator[Request]:
        for url in self.start_urls:
            yield from self._make_requests(url)

    def parse(self, response: Response) -> Iterator[Request]:
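        """Handle an internal documentation page: optionally submit its
        body to the vnu HTML validator, then crawl every extracted link.
        """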
        self.log(response)
        if getattr(self, "validate_html", False):
            yield Request(
                "http://127.0.0.1:9988/?out=json",
                method="POST",
                headers={"Content-Type": response.headers["Content-Type"]},
                body=response.body,
                callback=self._vnu_callback(response.url),
                errback=self.error_callback,
            )
        for link in LxmlLinkExtractor(
            deny_domains=self.deny_domains,
            deny_extensions=["doc"],
            tags=self.tags,
            attrs=self.attrs,
            deny=self.deny,
            canonicalize=False,
        ).extract_links(response):
            yield from self._make_requests(link.url)

    def retry_request_with_get(self, request: Request) -> Iterator[Request]:
        request.method = "GET"
        request.dont_filter = True
        yield request

    def error_callback(self, failure: Failure) -> Failure | Iterator[Request] | None:
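        """Handle a failed request: ignore known-bad hosts, retry with GET
        when a server rejects HEAD, and otherwise log the broken link.
        """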
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            # Hack: The filtering above does not catch this URL,
            # likely due to a redirect.
            if urlsplit(response.url).netloc == "idmsa.apple.com":
                return None
            if response.status == 405 and response.request.method == "HEAD":
                # Method 'HEAD' not allowed, repeat request with 'GET'
                return self.retry_request_with_get(response.request)
            self.logger.error("Please check link: %s", response.request.url)

        return failure
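

# A minimal usage sketch with hypothetical names (the real spider
# definitions are not shown here): a concrete subclass mainly needs
# start_urls, plus optional flags like validate_html or skip_external
# that the base class reads via getattr.
#
#     class HelpDocumentationSpider(BaseDocumentationSpider):
#         name = "help_documentation_crawler"
#         start_urls = ["http://localhost:9981/help/"]
#         validate_html = True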