test-help-documentation: Validate HTML with vnu.jar.

The VNU_IGNORE whitelist lets in some crazy-invalid preexisting HTML, but
hopefully this will stop the problem from getting much larger.

Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
committed by Tim Abbott
parent f407a12ba2
commit 2d50dcf7cc
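As a rough sketch of how the VNU_IGNORE whitelist added below behaves (not part of the commit; the sample messages and the "settings" ID are invented for illustration): the individual patterns are joined into one alternation and applied with fullmatch(), so a vnu.jar message is suppressed only when the entire message matches a whitelisted pattern, and anything new is still reported.

import re

# Hedged illustration, not the committed code: two of the whitelisted
# patterns from the diff below, joined with '|' and checked with fullmatch().
VNU_IGNORE = re.compile(r'|'.join([
    r'Duplicate ID “[^”]*”\.',
    r'The “type” attribute is unnecessary for JavaScript resources\.',
]))

assert VNU_IGNORE.fullmatch('Duplicate ID “settings”.')               # whitelisted: ignored
assert not VNU_IGNORE.fullmatch('Element “style” not allowed here.')  # not whitelisted: reported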
@@ -1,3 +1,4 @@
+import json
 import re
 import scrapy
 
@@ -24,6 +25,17 @@ EXCLUDED_URLS = [
     'https://www.linkedin.com/company/zulip-project',
 ]
 
+VNU_IGNORE = re.compile(r'|'.join([
+    # Real errors that should be fixed.
+    r'Duplicate ID “[^”]*”\.',
+    r'The first occurrence of ID “[^”]*” was here\.',
+    r'Attribute “markdown” not allowed on element “div” at this point\.',
+    r'No “p” element in scope but a “p” end tag seen\.',
+
+    # Warnings that are probably less important.
+    r'The “type” attribute is unnecessary for JavaScript resources\.',
+]))
+
 
 class BaseDocumentationSpider(scrapy.Spider):
     name = None  # type: Optional[str]
@@ -68,6 +80,24 @@ class BaseDocumentationSpider(scrapy.Spider):
             self.logger.error(
                 "Fragment #%s is not found on page %s", fragment, response.request.url)
 
+    def _vnu_callback(self, url: str) -> Callable[[Response], None]:
+        def callback(response: Response) -> None:
+            vnu_out = json.loads(response.text)
+            for message in vnu_out['messages']:
+                if not VNU_IGNORE.fullmatch(message['message']):
+                    self.logger.error(
+                        '"%s":%d.%d-%d.%d: %s: %s',
+                        url,
+                        message.get('firstLine', message['lastLine']),
+                        message.get('firstColumn', message['lastColumn']),
+                        message['lastLine'],
+                        message['lastColumn'],
+                        message['type'],
+                        message['message'],
+                    )
+
+        return callback
+
     def _make_requests(self, url: str) -> Iterable[Request]:
         callback = self.parse  # type: Callable[[Response], Optional[Iterable[Request]]]
         dont_filter = False
@@ -89,6 +119,17 @@ class BaseDocumentationSpider(scrapy.Spider):
 
     def parse(self, response: Response) -> Iterable[Request]:
         self.log(response)
+
+        if getattr(self, 'validate_html', False):
+            yield Request(
+                'http://localhost:9988/?out=json',
+                method='POST',
+                headers={'Content-Type': response.headers['Content-Type']},
+                body=response.body,
+                callback=self._vnu_callback(response.url),
+                errback=self.error_callback,
+            )
+
         for link in LxmlLinkExtractor(deny_domains=self.deny_domains, deny_extensions=['doc'],
                                       tags=self.tags, attrs=self.attrs, deny=self.deny,
                                       canonicalize=False).extract_links(response):

@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import argparse
+import contextlib
 import os
 import sys
 import subprocess
-from typing import List
+from typing import Iterator
 
 ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
@@ -30,13 +31,28 @@ os.makedirs('var/help-documentation', exist_ok=True)
 LOG_FILE = 'var/help-documentation/server.log'
 external_host = "localhost:9981"
 
-extra_args = []  # type: List[str]
+extra_args = ['-a', 'validate_html=set']
 
 if options.skip_external_link_check:
-    extra_args = ['-a', 'skip_external=set']
+    extra_args += ['-a', 'skip_external=set']
 
-with test_server_running(options.force, external_host, log_file=LOG_FILE,
-                         dots=True, use_db=True):
+@contextlib.contextmanager
+def vnu_servlet() -> Iterator[None]:
+    with subprocess.Popen([
+        'java', '-cp',
+        os.path.join(
+            os.path.dirname(__file__),
+            '../node_modules/vnu-jar/build/dist/vnu.jar',
+        ),
+        'nu.validator.servlet.Main',
+        '9988',
+    ]) as proc:
+        yield
+        proc.terminate()
+
+with vnu_servlet(), \
+        test_server_running(options.force, external_host, log_file=LOG_FILE,
+                            dots=True, use_db=True):
     ret_help_doc = subprocess.call(['scrapy', 'crawl_with_status'] + extra_args +
                                    ['help_documentation_crawler'],
                                    cwd='tools/documentation_crawler')
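For manual testing, the validation endpoint the spider POSTs to can be exercised directly once the vnu.jar servlet started by the script above is listening on localhost:9988. A minimal standard-library sketch, assuming that servlet is running; the sample page, with its duplicate id, is made up:

import json
import urllib.request

# Hypothetical page containing a duplicate ID, the kind of error the
# VNU_IGNORE whitelist currently lets through.
html = (b'<!DOCTYPE html><html lang="en"><head><title>t</title></head>'
        b'<body><p id="x">a</p><p id="x">b</p></body></html>')

req = urllib.request.Request(
    'http://localhost:9988/?out=json',  # same endpoint the spider uses
    data=html,
    headers={'Content-Type': 'text/html; charset=utf-8'},
    method='POST',
)
with urllib.request.urlopen(req) as response:
    for message in json.loads(response.read().decode('utf-8'))['messages']:
        print(message['type'], message.get('lastLine'), message['message'])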