zulip/tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py

import json
import os
import re
from collections.abc import Callable, Iterator
from urllib.parse import urlsplit

import scrapy
from scrapy.http import Request, Response
from scrapy.linkextractors import IGNORED_EXTENSIONS
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.url import url_has_any_extension
from twisted.python.failure import Failure

EXCLUDED_DOMAINS = [
    # Returns 429 rate-limiting errors
    "github.com",
    "gist.github.com",
    # Returns 503 errors
    "www.amazon.com",
    "gitlab.com",
]

EXCLUDED_URLS = [
    # Google Calendar returns 404s on HEAD requests unconditionally
    "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
    # Returns 409 errors to HEAD requests frequently
    "https://medium.freecodecamp.org/",
    # Returns 404 to HEAD requests unconditionally
    "https://www.git-tower.com/blog/command-line-cheat-sheet/",
    "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
    "https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh",
    # Requires authentication
    "https://www.linkedin.com/company/zulip-project",
    # Returns 403 errors to HEAD requests
    "https://giphy.com",
    "https://giphy.com/apps/giphycapture",
    "https://www.udemy.com/course/the-complete-react-native-and-redux-course/",
]

VNU_IGNORE = [
    # Real errors that should be fixed.
    r"Attribute “markdown” not allowed on element “div” at this point\.",
    r"No “p” element in scope but a “p” end tag seen\.",
    (
        r"Element “div” not allowed as child of element “ul” in this context\."
        r" \(Suppressing further errors from this subtree\.\)"
    ),
    # Opinionated informational messages.
    r"Trailing slash on void elements has no effect and interacts badly with unquoted attribute values\.",
]
VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))
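
# Six levels up from this file: the root of the Zulip checkout.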
DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"
ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX = "/zulip/zulip/tree/main"


class BaseDocumentationSpider(scrapy.Spider):
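    """Base class for the documentation link-checking spiders.

    The spider crawls outward from start_urls, following links found in
    each page, and logs errors for broken links, missing #fragment
    anchors, and (when validate_html is set) HTML validation problems.
    """
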
    name: str | None = None
    # Domains whose links should not be crawled.
    deny_domains: list[str] = []
    start_urls: list[str] = []
    deny: list[str] = []
    file_extensions: list[str] = ["." + ext for ext in IGNORED_EXTENSIONS]
    tags = ("a", "area", "img")
    attrs = ("href", "src")

    def _has_extension(self, url: str) -> bool:
        return url_has_any_extension(url, self.file_extensions)

    def _is_external_url(self, url: str) -> bool:
        return url.startswith("http") or self._has_extension(url)

    def check_existing(self, response: Response) -> None:
        self.log(response)

    def _is_external_link(self, url: str) -> bool:
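        """Decide whether a URL should be treated as an external link,
        which the crawl skips entirely when skip_external is enabled.
        """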
        split_url = urlsplit(url)
        if split_url.hostname in ("chat.zulip.org", "status.zulip.com"):
            # Since most chat.zulip.org URLs will be links to specific
            # logged-in content that the spider cannot verify, or the
            # homepage, there's no need to check those (which can
            # cause errors when chat.zulip.org is being updated).
            #
            # status.zulip.com is externally hosted and, in a peculiar twist of
            # cosmic irony, often itself offline.
            return True
        if split_url.hostname == "zulip.readthedocs.io" or f".{split_url.hostname}".endswith(
            (".zulip.com", ".zulip.org")
        ):
            # We want CI to check any links to Zulip sites.
            return False
        if split_url.scheme == "file" or split_url.hostname == "localhost":
            # We also want CI to check any links to built documentation.
            return False
        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
            (
                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/",
                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/",
            )
        ):
            # We can verify these links directly in the local Git repo,
            # without making any requests to GitHub servers.
            return False
        if split_url.hostname == "github.com" and split_url.path.startswith("/zulip/"):
            # We want to check these links, but due to rate limiting from
            # GitHub, these checks often fail in CI. Thus, we should treat
            # these as external links for now.
            # TODO: Figure out how to test github.com/zulip links in CI.
            return True
        return True

    def check_fragment(self, response: Response) -> None:
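        """Check that the #fragment of the request URL is present in the
        fetched page as an id or name attribute, logging an error if not.
        """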
        self.log(response)
        xpath_template = "//*[@id='{fragment}' or @name='{fragment}']"
        fragment = urlsplit(response.request.url).fragment
        # Check that the fragment exists on the response page.
        if not response.selector.xpath(xpath_template.format(fragment=fragment)):
            self.logger.error(
                "Fragment #%s is not found on page %s", fragment, response.request.url
            )

    def _vnu_callback(self, url: str) -> Callable[[Response], None]:
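        """Return a callback that logs every message reported by the vnu
        HTML validator for the page at `url`, except those matching
        VNU_IGNORE_REGEX.
        """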
        def callback(response: Response) -> None:
            vnu_out = json.loads(response.text)
            for message in vnu_out["messages"]:
                if not VNU_IGNORE_REGEX.fullmatch(message["message"]):
                    self.logger.error(
                        '"%s":%d.%d-%d.%d: %s: %s',
                        url,
                        message.get("firstLine", message["lastLine"]),
                        message.get("firstColumn", message["lastColumn"]),
                        message["lastLine"],
                        message["lastColumn"],
                        message["type"],
                        message["message"],
                    )

        return callback

    def _make_requests(self, url: str) -> Iterator[Request]:
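        """Yield at most one Request for this URL, choosing method and
        callback based on whether it is internal documentation, an
        external link, a GitHub link verifiable from the local repo,
        or excluded entirely.
        """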
        # These URLs are for Zulip's web app, which with recent changes
        # can be accessible without logging into an account. While we
        # do crawl documentation served by the web app (e.g. /help/),
        # we don't want to crawl the web app itself, so we exclude
        # these.
        split_url = urlsplit(url)
        if split_url.netloc == "localhost:9981" and split_url.path in ["", "/"]:
            return

        # These pages have anchor links, like #all, that are invisible
        # to the user, and would otherwise fail this test.
        if url.startswith("http://localhost:9981/communities"):
            return
        if url.startswith("http://localhost:9981/plans"):
            return

        callback: Callable[[Response], Iterator[Request] | None] = self.parse
        dont_filter = False
        method = "GET"
        if self._is_external_url(url):
            callback = self.check_existing
            method = "HEAD"

            if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/"
            ):
                file_path = (
                    DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX) :]
                )
                if not os.path.isfile(file_path):
                    self.logger.error(
                        "There is no local file associated with the GitHub URL: %s", url
                    )
                return
            elif split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/"
            ):
                dir_path = (
                    DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX) :]
                )
                if not os.path.isdir(dir_path):
                    self.logger.error(
                        "There is no local directory associated with the GitHub URL: %s", url
                    )
                return
        elif split_url.fragment != "":
            dont_filter = True
            callback = self.check_fragment

        if getattr(self, "skip_external", False) and self._is_external_link(url):
            return
        if split_url.hostname in EXCLUDED_DOMAINS:
            return
        if url in EXCLUDED_URLS:
            return

        yield Request(
            url,
            method=method,
            callback=callback,
            dont_filter=dont_filter,
            errback=self.error_callback,
        )

    def start_requests(self) -> Iterator[Request]:
        for url in self.start_urls:
            yield from self._make_requests(url)

    def parse(self, response: Response) -> Iterator[Request]:
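        """Handle an internal documentation page: optionally submit its
        body to the vnu HTML validator, then crawl every extracted link.
        """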
        self.log(response)
        if getattr(self, "validate_html", False):
            yield Request(
                "http://127.0.0.1:9988/?out=json",
                method="POST",
                headers={"Content-Type": response.headers["Content-Type"]},
                body=response.body,
                callback=self._vnu_callback(response.url),
                errback=self.error_callback,
            )
        for link in LxmlLinkExtractor(
            deny_domains=self.deny_domains,
            deny_extensions=["doc"],
            tags=self.tags,
            attrs=self.attrs,
            deny=self.deny,
            canonicalize=False,
        ).extract_links(response):
            yield from self._make_requests(link.url)

    def retry_request_with_get(self, request: Request) -> Iterator[Request]:
        request.method = "GET"
        request.dont_filter = True
        yield request

    def error_callback(self, failure: Failure) -> Failure | Iterator[Request] | None:
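        """Handle a failed request: ignore known-bad hosts, retry with GET
        when a server rejects HEAD, and otherwise log the broken link.
        """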
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            # Hack: The filtering above does not catch this URL,
            # likely due to a redirect.
            if urlsplit(response.url).netloc == "idmsa.apple.com":
                return None
            if response.status == 405 and response.request.method == "HEAD":
                # Method 'HEAD' not allowed, repeat request with 'GET'
                return self.retry_request_with_get(response.request)
            self.logger.error("Please check link: %s", response.request.url)

        return failure
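

# A minimal usage sketch with hypothetical names (the real spider
# definitions are not shown here): a concrete subclass mainly needs
# start_urls, plus optional flags like validate_html or skip_external
# that the base class reads via getattr.
#
#     class HelpDocumentationSpider(BaseDocumentationSpider):
#         name = "help_documentation_crawler"
#         start_urls = ["http://localhost:9981/help/"]
#         validate_html = True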