python: Clean up janky URL matching code with urlsplit.
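
A quick reference for the urlsplit() fields this cleanup matches on
(standard-library behavior, shown here for illustration only):

    >>> from urllib.parse import urlsplit
    >>> split_url = urlsplit("https://github.com/zulip/zulip/blob/main/README.md#install")
    >>> (split_url.scheme, split_url.hostname, split_url.path, split_url.fragment)
    ('https', 'github.com', '/zulip/zulip/blob/main/README.md', 'install')
    >>> urlsplit("https://ChAt.Zulip.org:443/").hostname  # lowercased, port stripped
    'chat.zulip.org'

Matching parsed components avoids the false positives that substring
tests against the raw URL allow (e.g. "zulip.com" in url matching
"https://example.com/?ref=zulip.com").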

Signed-off-by: Anders Kaseorg <anders@zulip.com>
Author: Anders Kaseorg
Date: 2023-01-18 12:40:44 -05:00
Committed by: Anders Kaseorg
Parent: 1e96c6e9a2
Commit: 8f7a7877fe

5 changed files with 53 additions and 37 deletions

tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py

@@ -2,7 +2,7 @@ import json
 import os
 import re
 from typing import Callable, Iterator, List, Optional, Union
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlsplit
 
 import scrapy
 from scrapy.http import Request, Response
@@ -56,8 +56,8 @@ VNU_IGNORE_REGEX = re.compile(r"|".join(VNU_IGNORE))
 
 DEPLOY_ROOT = os.path.abspath(os.path.join(__file__, "../../../../../.."))
 
-ZULIP_SERVER_GITHUB_FILE_URL_PREFIX = "https://github.com/zulip/zulip/blob/main"
-ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX = "https://github.com/zulip/zulip/tree/main"
+ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"
+ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX = "/zulip/zulip/tree/main"
 
 
 class BaseDocumentationSpider(scrapy.Spider):
@@ -80,24 +80,30 @@ class BaseDocumentationSpider(scrapy.Spider):
         self.log(response)
 
     def _is_external_link(self, url: str) -> bool:
-        if url.startswith("https://chat.zulip.org"):
+        split_url = urlsplit(url)
+        if split_url.hostname == "chat.zulip.org":
             # Since most chat.zulip.org URLs will be links to specific
             # logged-in content that the spider cannot verify, or the
             # homepage, there's no need to check those (which can
             # cause errors when chat.zulip.org is being updated).
             return True
-        if "zulip.readthedocs" in url or "zulip.com" in url or "zulip.org" in url:
+        if split_url.hostname == "zulip.readthedocs.io" or f".{split_url.hostname}".endswith(
+            (".zulip.com", ".zulip.org")
+        ):
             # We want CI to check any links to Zulip sites.
             return False
-        if (len(url) > 4 and url[:4] == "file") or ("localhost" in url):
+        if split_url.scheme == "file" or split_url.hostname == "localhost":
             # We also want CI to check any links to built documentation.
             return False
-        if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX) or url.startswith(
-            ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX
+        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
+            (
+                f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/",
+                f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/",
+            )
         ):
             # We can verify these links directly in the local Git repo without making any requests to GitHub servers.
             return False
-        if "github.com/zulip" in url:
+        if split_url.hostname == "github.com" and split_url.path.startswith("/zulip/"):
             # We want to check these links but due to rate limiting from GitHub, these checks often
             # fail in the CI. Thus, we should treat these as external links for now.
             # TODO: Figure out how to test github.com/zulip links in CI.
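
Note: the f".{split_url.hostname}".endswith((".zulip.com", ".zulip.org"))
test above matches an apex domain and all of its subdomains with a single
endswith() call, without matching lookalike hosts. A minimal sketch
(hostnames are hypothetical):

    >>> for hostname in ["zulip.com", "blog.zulip.com", "notzulip.com"]:
    ...     print(hostname, f".{hostname}".endswith((".zulip.com", ".zulip.org")))
    zulip.com True
    blog.zulip.com True
    notzulip.com False

The f"{split_url.path}/".startswith(f"{...PATH_PREFIX}/") pattern applies
the same boundary trick to paths: it accepts the prefix itself and any
subpath under it, but rejects e.g. "/zulip/zulip/blob/mainline".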
@@ -107,10 +113,7 @@ class BaseDocumentationSpider(scrapy.Spider):
     def check_fragment(self, response: Response) -> None:
         self.log(response)
         xpath_template = "//*[@id='{fragment}' or @name='{fragment}']"
-        m = re.match(r".+\#(?P<fragment>.*)$", response.request.url)  # Get fragment value.
-        if not m:
-            return
-        fragment = m.group("fragment")
+        fragment = urlsplit(response.request.url).fragment
         # Check fragment existing on response page.
         if not response.selector.xpath(xpath_template.format(fragment=fragment)):
             self.logger.error(
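
Note: urlsplit(...).fragment is "" for a URL with no "#", so the old
regex and its early return collapse into one expression; and since
check_fragment is only installed as the callback when the URL has a
nonempty fragment (see the last hunk below), the empty case does not
arise in practice.

    >>> from urllib.parse import urlsplit
    >>> urlsplit("http://localhost:9981/help/#searching").fragment
    'searching'
    >>> urlsplit("http://localhost:9981/help/").fragment
    ''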
@@ -141,11 +144,8 @@ class BaseDocumentationSpider(scrapy.Spider):
         # do crawl documentation served by the web app (e.g. /help/),
         # we don't want to crawl the web app itself, so we exclude
         # these.
-        if (
-            url in ["http://localhost:9981/", "http://localhost:9981"]
-            or url.startswith("http://localhost:9981/#")
-            or url.startswith("http://localhost:9981#")
-        ):
+        split_url = urlsplit(url)
+        if split_url.netloc == "localhost:9981" and split_url.path in ["", "/"]:
             return
 
         # This page has some invisible to the user anchor links like #all
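
Note: netloc is the right field here rather than hostname, since it
keeps the port and so pins the check to the web app on localhost:9981.
The single path test covers all four URL forms the old code enumerated,
because the fragment never reaches .path:

    >>> from urllib.parse import urlsplit
    >>> [urlsplit(u).path for u in [
    ...     "http://localhost:9981",
    ...     "http://localhost:9981/",
    ...     "http://localhost:9981#foo",
    ...     "http://localhost:9981/#foo",
    ... ]]
    ['', '/', '', '/']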
@@ -160,29 +160,34 @@ class BaseDocumentationSpider(scrapy.Spider):
             callback = self.check_existing
             method = "HEAD"
 
-        if url.startswith(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX):
-            file_path = url.replace(ZULIP_SERVER_GITHUB_FILE_URL_PREFIX, DEPLOY_ROOT)
-            hash_index = file_path.find("#")
-            if hash_index != -1:
-                file_path = file_path[:hash_index]
+        if split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
+            f"{ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX}/"
+        ):
+            file_path = (
+                DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX) :]
+            )
             if not os.path.isfile(file_path):
                 self.logger.error(
                     "There is no local file associated with the GitHub URL: %s", url
                 )
             return
-        elif url.startswith(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX):
-            dir_path = url.replace(ZULIP_SERVER_GITHUB_DIRECTORY_URL_PREFIX, DEPLOY_ROOT)
+        elif split_url.hostname == "github.com" and f"{split_url.path}/".startswith(
+            f"{ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX}/"
+        ):
+            dir_path = (
+                DEPLOY_ROOT + split_url.path[len(ZULIP_SERVER_GITHUB_DIRECTORY_PATH_PREFIX) :]
+            )
             if not os.path.isdir(dir_path):
                 self.logger.error(
                     "There is no local directory associated with the GitHub URL: %s", url
                 )
             return
-        elif "#" in url:
+        elif split_url.fragment != "":
             dont_filter = True
             callback = self.check_fragment
         if getattr(self, "skip_external", False) and self._is_external_link(url):
             return
-        if urlparse(url).netloc in EXCLUDED_DOMAINS:
+        if split_url.hostname in EXCLUDED_DOMAINS:
             return
         if url in EXCLUDED_URLS:
             return
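
Note on the last hunk: slicing split_url.path by the length of the path
prefix replaces both the url.replace() call and the manual hash_index
fragment stripping, since urlsplit keeps the fragment out of .path.
A sketch of the mapping (the DEPLOY_ROOT value is assumed for
illustration):

    >>> from urllib.parse import urlsplit
    >>> DEPLOY_ROOT = "/srv/zulip"  # assumed; the real value is computed from __file__
    >>> ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX = "/zulip/zulip/blob/main"
    >>> path = urlsplit("https://github.com/zulip/zulip/blob/main/docs/README.md#top").path
    >>> DEPLOY_ROOT + path[len(ZULIP_SERVER_GITHUB_FILE_PATH_PREFIX):]
    '/srv/zulip/docs/README.md'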