mirror of
https://github.com/zulip/zulip.git
synced 2025-11-16 20:02:15 +00:00
249 lines
10 KiB
Python
249 lines
10 KiB
Python
import json
|
|
import os
|
|
import re
|
|
from collections.abc import Callable, Iterator
|
|
from urllib.parse import urlsplit
|
|
|
|
import scrapy
|
|
from scrapy.http import Request, Response
|
|
from scrapy.linkextractors import IGNORED_EXTENSIONS
|
|
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
|
|
from scrapy.spidermiddlewares.httperror import HttpError
|
|
from scrapy.utils.url import url_has_any_extension
|
|
from twisted.python.failure import Failure
|
|
|
|
# Hostnames whose links are never requested by the spider.
EXCLUDED_DOMAINS = [
    # Returns 429 rate-limiting errors
    "github.com",
    "gist.github.com",
    # Returns 503 errors
    "www.amazon.com",
    "gitlab.com",
]

# Exact URLs (compared as whole strings) that are never requested.
EXCLUDED_URLS = [
    # Google calendar returns 404s on HEAD requests unconditionally
    "https://calendar.google.com/calendar/embed?src=ktiduof4eoh47lmgcl2qunnc0o@group.calendar.google.com",
    # Returns 409 errors to HEAD requests frequently
    "https://medium.freecodecamp.org/",
    # Returns 404 to HEAD requests unconditionally
    "https://www.git-tower.com/blog/command-line-cheat-sheet/",
    "https://marketplace.visualstudio.com/items?itemName=rafaelmaiolla.remote-vscode",
    "https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh",
    # Requires authentication
    "https://www.linkedin.com/company/zulip-project",
    # Returns 403 errors to HEAD requests
    "https://giphy.com",
    "https://giphy.com/apps/giphycapture",
    "https://www.udemy.com/course/the-complete-react-native-and-redux-course/",
]

# Regex fragments for HTML validator messages that are not reported.
# Each entry must match a message in full (see VNU_IGNORE_REGEX).
VNU_IGNORE = [
    # Real errors that should be fixed.
    r"Attribute “markdown” not allowed on element “div” at this point\.",
    r"No “p” element in scope but a “p” end tag seen\.",
    r"Element “div” not allowed as child of element “ul” in this context\. \(Suppressing further errors from this subtree\.\)",
    # Opinionated informational messages.
    r"Trailing slash on void elements has no effect and interacts badly with unquoted attribute values\.",
]
# Single alternation over every ignored message; callers use fullmatch().
VNU_IGNORE_REGEX = re.compile("|".join(VNU_IGNORE))

# Six directory levels above this file.
DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))

# URL path prefixes of links into the Zulip server repository on GitHub
# (files and directories on the main branch, respectively).
ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"
ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX = "/zulip/zulip/tree/main"
|
|
|
|
|
|
class BaseDocumentationSpider(scrapy.Spider):
    """Base spider that crawls documentation pages and checks their links.

    Pages handled by ``parse`` have their links extracted and turned into
    follow-up requests by ``_make_requests``. Depending on the URL, a link
    is crawled further, only checked for existence with a HEAD request,
    checked for a fragment target, verified against the local checkout,
    or skipped. Failures are reported through ``self.logger.error``.

    Two optional instance attributes are read with ``getattr``:
    ``skip_external`` (skip links classified by ``_is_external_link``) and
    ``validate_html`` (submit each parsed page to an HTML validator).
    """

    name: str | None = None
    # Exclude domain address.
    deny_domains: list[str] = []
    start_urls: list[str] = []
    # URL patterns passed to the link extractor as its ``deny`` argument.
    deny: list[str] = []
    # IGNORED_EXTENSIONS with a leading dot, as used by _has_extension().
    file_extensions: list[str] = ["." + ext for ext in IGNORED_EXTENSIONS]
    # Elements and attributes the link extractor reads links from.
    tags = ("a", "area", "img")
    attrs = ("href", "src")

    def _has_extension(self, url: str) -> bool:
        """Return whether ``url`` ends in one of ``self.file_extensions``."""
        return url_has_any_extension(url, self.file_extensions)

    def _is_external_url(self, url: str) -> bool:
        """Return whether ``url`` should only be checked, not crawled.

        True for any URL starting with ``"http"`` and for any URL carrying
        one of the ignored file extensions.
        """
        return url.startswith("http") or self._has_extension(url)

    def check_existing(self, response: Response) -> None:
        """Callback for existence checks; a response arriving is enough."""
        self.log(response)

    def _is_external_link(self, url: str) -> bool:
        """Return whether ``url`` is skipped when ``skip_external`` is set.

        Returns False (i.e. keep checking) for Zulip sites, ``file:`` and
        ``localhost`` URLs, and links into the Zulip server repository on
        GitHub; returns True for everything else.
        """
        split_url = urlsplit(url)
        if split_url.hostname in ("chat.zulip.org", "status.zulip.com"):
            # Since most chat.zulip.org URLs will be links to specific
            # logged-in content that the spider cannot verify, or the
            # homepage, there's no need to check those (which can
            # cause errors when chat.zulip.org is being updated).
            #
            # status.zulip.com is externally hosted and, in a peculiar twist of
            # cosmic irony, often itself offline.
            return True
        # Prefixing the hostname with "." lets one endswith() call match both
        # the bare domain and any of its subdomains.
        if split_url.hostname == "zulip.readthedocs.io" or f".{split_url.hostname}".endswith(
            (".zulip.com", ".zulip.org")
        ):
            # We want CI to check any links to Zulip sites.
            return False
        if split_url.scheme == "file" or split_url.hostname == "localhost":
            # We also want CI to check any links to built documentation.
            return False
        # Appending "/" to the path makes the prefix match whole path
        # components only (".../blob/main" but not ".../blob/mainline").
        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
            (
                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/",
                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/",
            )
        ):
            # We can verify these links directly in the local Git repo without
            # making any requests to GitHub servers.
            return False
        if split_url.hostname == "github.com" and split_url.path.startswith("/zulip/"):
            # We want to check these links but due to rate limiting from GitHub, these checks often
            # fail in the CI. Thus, we should treat these as external links for now.
            # TODO: Figure out how to test github.com/zulip links in CI.
            return True
        return True

    def check_fragment(self, response: Response) -> None:
        """Log an error if the request URL's fragment has no target on the page.

        A target is any element whose ``id`` or ``name`` attribute equals
        the fragment.
        """
        self.log(response)
        fragment = urlsplit(response.request.url).fragment
        # Check fragment existing on response page. The fragment is bound as
        # an XPath variable rather than interpolated into the expression, so
        # a fragment containing a quote character cannot produce a malformed
        # XPath query.
        if not response.selector.xpath(
            "//*[@id=$fragment or @name=$fragment]", fragment=fragment
        ):
            self.logger.error(
                "Fragment #%s is not found on page %s", fragment, response.request.url
            )

    def _vnu_callback(self, url: str) -> Callable[[Response], None]:
        """Build the callback that reports validator output for ``url``.

        The returned callback parses the response body as JSON and logs an
        error for every entry of its ``"messages"`` list whose text does not
        fully match ``VNU_IGNORE_REGEX``.
        """

        def callback(response: Response) -> None:
            vnu_out = json.loads(response.text)
            for message in vnu_out["messages"]:
                if VNU_IGNORE_REGEX.fullmatch(message["message"]):
                    continue
                # NOTE(review): the validator's JSON format appears to allow
                # messages without location keys; fall back to 0 instead of
                # raising KeyError and hiding the message -- confirm against
                # the checker's output documentation.
                last_line = message.get("lastLine", 0)
                last_column = message.get("lastColumn", 0)
                self.logger.error(
                    '"%s":%d.%d-%d.%d: %s: %s',
                    url,
                    # The start position defaults to the end position.
                    message.get("firstLine", last_line),
                    message.get("firstColumn", last_column),
                    last_line,
                    last_column,
                    message["type"],
                    message["message"],
                )

        return callback

    def _make_requests(self, url: str) -> Iterator[Request]:
        """Yield the request (if any) needed to check ``url``.

        Yields nothing for URLs that are deliberately skipped and for
        GitHub links into the Zulip server repository, which are checked
        against the local filesystem under ``DEPLOY_ROOT`` instead.
        """
        # These URLs are for Zulip's web app, which with recent changes
        # can be accessible without logging into an account. While we
        # do crawl documentation served by the web app (e.g. /help/),
        # we don't want to crawl the web app itself, so we exclude
        # these.
        split_url = urlsplit(url)
        if split_url.netloc == "localhost:9981" and split_url.path in ["", "/"]:
            return

        # These pages have some anchor links (like #all) that are currently
        # invisible to the user, and thus would otherwise fail this test.
        if url.startswith(
            ("http://localhost:9981/communities", "http://localhost:9981/plans")
        ):
            return

        callback: Callable[[Response], Iterator[Request] | None] = self.parse
        dont_filter = False
        method = "GET"
        if self._is_external_url(url):
            # Only confirm that the target exists; do not crawl it.
            callback = self.check_existing
            method = "HEAD"

            if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/"
            ):
                # Map the GitHub file URL onto the local checkout.
                file_path = (
                    DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX) :]
                )
                if not os.path.isfile(file_path):
                    self.logger.error(
                        "There is no local file associated with the GitHub URL: %s", url
                    )
                return
            elif split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/"
            ):
                # Map the GitHub directory URL onto the local checkout.
                dir_path = (
                    DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX) :]
                )
                if not os.path.isdir(dir_path):
                    self.logger.error(
                        "There is no local directory associated with the GitHub URL: %s", url
                    )
                return
        elif split_url.fragment != "":
            # dont_filter is set so the request is not dropped by the
            # duplicate filter; each fragment needs its own check.
            dont_filter = True
            callback = self.check_fragment
        if getattr(self, "skip_external", False) and self._is_external_link(url):
            return
        if split_url.hostname in EXCLUDED_DOMAINS:
            return
        if url in EXCLUDED_URLS:
            return
        yield Request(
            url,
            method=method,
            callback=callback,
            dont_filter=dont_filter,
            errback=self.error_callback,
        )

    def start_requests(self) -> Iterator[Request]:
        """Yield the initial requests for every URL in ``start_urls``."""
        for url in self.start_urls:
            yield from self._make_requests(url)

    def parse(self, response: Response) -> Iterator[Request]:
        """Yield follow-up requests for every link found in ``response``.

        When the ``validate_html`` attribute is set, the page body is also
        POSTed to the validator at ``http://127.0.0.1:9988/`` and the result
        is handled by the callback from ``_vnu_callback``.
        """
        self.log(response)

        if getattr(self, "validate_html", False):
            yield Request(
                "http://127.0.0.1:9988/?out=json",
                method="POST",
                headers={"Content-Type": response.headers["Content-Type"]},
                body=response.body,
                callback=self._vnu_callback(response.url),
                errback=self.error_callback,
            )

        for link in LxmlLinkExtractor(
            deny_domains=self.deny_domains,
            deny_extensions=["doc"],
            tags=self.tags,
            attrs=self.attrs,
            deny=self.deny,
            canonicalize=False,
        ).extract_links(response):
            yield from self._make_requests(link.url)

    def retry_request_with_get(self, request: Request) -> Iterator[Request]:
        """Re-issue ``request`` as a GET, bypassing the duplicate filter.

        The request object is modified in place and then yielded.
        """
        request.method = "GET"
        request.dont_filter = True
        yield request

    def error_callback(self, failure: Failure) -> Failure | Iterator[Request] | None:
        """Errback shared by every request this spider issues.

        For an ``HttpError``: responses from ``idmsa.apple.com`` are ignored
        (returns None), a 405 reply to a HEAD request is retried with GET,
        and anything else is logged. In all remaining cases the failure is
        returned.
        """
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            # Hack: The filtering above does not catch this URL,
            # likely due to a redirect.
            if urlsplit(response.url).netloc == "idmsa.apple.com":
                return None
            if response.status == 405 and response.request.method == "HEAD":
                # Method 'HEAD' not allowed, repeat request with 'GET'
                return self.retry_request_with_get(response.request)
            self.logger.error("Please check link: %s", response.request.url)

        return failure
|