mirror of https://github.com/zulip/zulip.git
synced 2025-11-15 03:11:54 +00:00

commit 2d50dcf7cc (parent f407a12ba2), committed by Tim Abbott

test-help-documentation: Validate HTML with vnu.jar.

The VNU_IGNORE whitelist lets in some crazy-invalid preexisting HTML, but
hopefully this will stop the problem from getting much larger.

Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
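The change has two halves, shown in the diffs below: the documentation crawler learns to POST each crawled page to a locally running vnu.jar validation servlet and log any validator message not covered by the VNU_IGNORE whitelist, and the test-help-documentation wrapper learns to start (and tear down) that servlet around the crawl. Here is a minimal standalone sketch of the same flow; the jar path, servlet entry point, port, and ?out=json query are taken from the diffs, while the sample document and the sleep-based startup wait are illustrative stand-ins:

import json
import subprocess
import time
import urllib.request

# Start vnu.jar as a long-running validation servlet on port 9988
# (the same invocation as the vnu_servlet() helper in the second diff).
proc = subprocess.Popen([
    'java', '-cp', 'node_modules/vnu-jar/build/dist/vnu.jar',
    'nu.validator.servlet.Main', '9988',
])
time.sleep(5)  # crude startup wait for a sketch; polling the port would be more robust

try:
    # POST raw HTML to the servlet; ?out=json selects the JSON report format.
    request = urllib.request.Request(
        'http://localhost:9988/?out=json',
        data=b'<!DOCTYPE html><html><head></head><body><p>hi</p></body></html>',
        headers={'Content-Type': 'text/html; charset=utf-8'},
    )
    with urllib.request.urlopen(request) as response:
        report = json.loads(response.read().decode())
    for message in report['messages']:
        print(message['type'], message['message'])
finally:
    proc.terminate()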
tools/documentation_crawler/documentation_crawler/spiders/common/spiders.py

@@ -1,3 +1,4 @@
+import json
 import re
 
 import scrapy
@@ -24,6 +25,17 @@ EXCLUDED_URLS = [
     'https://www.linkedin.com/company/zulip-project',
 ]
 
+VNU_IGNORE = re.compile(r'|'.join([
+    # Real errors that should be fixed.
+    r'Duplicate ID “[^”]*”\.',
+    r'The first occurrence of ID “[^”]*” was here\.',
+    r'Attribute “markdown” not allowed on element “div” at this point\.',
+    r'No “p” element in scope but a “p” end tag seen\.',
+
+    # Warnings that are probably less important.
+    r'The “type” attribute is unnecessary for JavaScript resources\.',
+]))
+
 
 class BaseDocumentationSpider(scrapy.Spider):
     name = None  # type: Optional[str]
@@ -68,6 +80,24 @@ class BaseDocumentationSpider(scrapy.Spider):
             self.logger.error(
                 "Fragment #%s is not found on page %s", fragment, response.request.url)
 
+    def _vnu_callback(self, url: str) -> Callable[[Response], None]:
+        def callback(response: Response) -> None:
+            vnu_out = json.loads(response.text)
+            for message in vnu_out['messages']:
+                if not VNU_IGNORE.fullmatch(message['message']):
+                    self.logger.error(
+                        '"%s":%d.%d-%d.%d: %s: %s',
+                        url,
+                        message.get('firstLine', message['lastLine']),
+                        message.get('firstColumn', message['lastColumn']),
+                        message['lastLine'],
+                        message['lastColumn'],
+                        message['type'],
+                        message['message'],
+                    )
+
+        return callback
+
     def _make_requests(self, url: str) -> Iterable[Request]:
         callback = self.parse  # type: Callable[[Response], Optional[Iterable[Request]]]
         dont_filter = False
@@ -89,6 +119,17 @@ class BaseDocumentationSpider(scrapy.Spider):
 
     def parse(self, response: Response) -> Iterable[Request]:
         self.log(response)
+
+        if getattr(self, 'validate_html', False):
+            yield Request(
+                'http://localhost:9988/?out=json',
+                method='POST',
+                headers={'Content-Type': response.headers['Content-Type']},
+                body=response.body,
+                callback=self._vnu_callback(response.url),
+                errback=self.error_callback,
+            )
+
         for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
                                       tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):
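For context on the callback added above: vnu.jar's JSON report is an object with a 'messages' list, and _vnu_callback reads each entry's type, message, firstLine/firstColumn, and lastLine/lastColumn fields. Below is a self-contained sketch of the filtering logic, using a trimmed copy of the whitelist; the sample payload and URL are invented for illustration:

import re

# Trimmed copy of the VNU_IGNORE whitelist from the diff above, so this
# demo is self-contained.
VNU_IGNORE = re.compile(r'|'.join([
    r'Duplicate ID “[^”]*”\.',
    r'The first occurrence of ID “[^”]*” was here\.',
]))

# Hand-written sample in the shape of vnu.jar's JSON report.
sample = {
    'messages': [
        {'type': 'error', 'lastLine': 7, 'lastColumn': 32,
         'message': 'Duplicate ID “sign-up”.'},   # fullmatches the whitelist: dropped
        {'type': 'error', 'firstLine': 3, 'firstColumn': 1,
         'lastLine': 3, 'lastColumn': 40,
         'message': 'Stray end tag “div”.'},      # not whitelisted: reported
    ],
}

for message in sample['messages']:
    if not VNU_IGNORE.fullmatch(message['message']):
        print('"%s":%d.%d-%d.%d: %s: %s' % (
            'http://localhost:9981/help/',                      # invented page URL
            message.get('firstLine', message['lastLine']),      # fall back to last*
            message.get('firstColumn', message['lastColumn']),  # when first* is absent
            message['lastLine'],
            message['lastColumn'],
            message['type'],
            message['message'],
        ))

Running this prints only the second message, formatted as "http://localhost:9981/help/":3.1-3.40: error: Stray end tag “div”.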
tools/test-help-documentation

@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import argparse
+import contextlib
 import os
 import sys
 import subprocess
-from typing import List
+from typing import Iterator
 
 ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
@@ -30,13 +31,28 @@ os.makedirs('var/help-documentation', exist_ok=True)
 LOG_FILE = 'var/help-documentation/server.log'
 external_host = "localhost:9981"
 
-extra_args = []  # type: List[str]
+extra_args = ['-a', 'validate_html=set']
 
 if options.skip_external_link_check:
-    extra_args = ['-a', 'skip_external=set']
+    extra_args += ['-a', 'skip_external=set']
 
-with test_server_running(options.force, external_host, log_file=LOG_FILE,
-                         dots=True, use_db=True):
+@contextlib.contextmanager
+def vnu_servlet() -> Iterator[None]:
+    with subprocess.Popen([
+        'java', '-cp',
+        os.path.join(
+            os.path.dirname(__file__),
+            '../node_modules/vnu-jar/build/dist/vnu.jar',
+        ),
+        'nu.validator.servlet.Main',
+        '9988',
+    ]) as proc:
+        yield
+        proc.terminate()
+
+with vnu_servlet(), \
+        test_server_running(options.force, external_host, log_file=LOG_FILE,
+                            dots=True, use_db=True):
     ret_help_doc = subprocess.call(['scrapy', 'crawl_with_status'] + extra_args +
                                    ['help_documentation_crawler'],
                                    cwd='tools/documentation_crawler')
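Two notes on the wrapper changes above. First, scrapy's -a key=value option becomes an attribute on the spider, so the unconditional -a validate_html=set makes getattr(self, 'validate_html', False) in parse() see the truthy string 'set'. Second, subprocess.Popen's context manager waits for the child to exit on the way out, and the validator servlet never exits on its own; that is why vnu_servlet() calls proc.terminate() explicitly before control leaves the with block, so shutdown does not hang.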
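With this in place, tools/test-help-documentation starts the vnu servlet and the Zulip test server together, crawls the help documentation, and validates every page's HTML as a side effect of the crawl; the custom crawl_with_status scrapy command is evidently what turns logged crawl errors, now including non-whitelisted validator messages, into a non-zero exit status. (Judging from options.skip_external_link_check, the external-link check can presumably be skipped with a --skip-external-link-check flag defined outside this diff.)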