mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-04 05:53:43 +00:00 
			
		
		
		
	This starts to address 1533. I still think the </p> tags should be on their own line lined up with the start tag, so the linter won't let through the specific example shown in the ticket.
		
			
				
	
	
		
			409 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			409 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from __future__ import absolute_import
from __future__ import print_function

import re
from typing import Callable, Optional
from typing import List, Set

from six.moves import range
 | 
						|
 | 
						|
class TokenizerState(object):
    """Mutable cursor used while scanning template text.

    ``i`` starts at 0, while ``line`` and ``col`` both start at 1.
    """

    def __init__(self):
        # type: () -> None
        self.i, self.line, self.col = 0, 1, 1
 | 
						|
 | 
						|
class Token(object):
    """A single tag found in a template.

    Stores the token ``kind``, the raw tag text ``s``, the extracted
    ``tag`` name, and the ``line``/``col`` values supplied by the caller.
    """

    def __init__(self, kind, s, tag, line, col):
        # type: (str, str, str, int, int) -> None
        self.kind, self.s, self.tag = kind, s, tag
        self.line, self.col = line, col
 | 
						|
 | 
						|
def tokenize(text):
    # type: (str) -> List[Token]
    """Scan template text and return its tags as a flat list of Tokens.

    Recognizes HTML start/end/singleton/special tags, handlebars block
    start/end tags (``{{#``, ``{{^``, ``{{/``) and django-style start/end
    tags (``{% ... %}``, ``{% end... %}``).  All other characters are
    skipped.  Each token records the 1-based line and column at which
    the tag begins.

    Raises Exception when a tag is unterminated (propagated from the
    get_*_tag helpers) or when a start tag has no name, e.g. ``<>``.
    """
    def advance(n):
        # type: (int) -> None
        # Step the cursor forward n characters, keeping line/col in sync.
        for _ in range(n):
            state.i += 1
            # state.i is at least 1 here, so looking back one char is safe.
            if text[state.i - 1] == '\n':
                state.line += 1
                state.col = 1
            else:
                state.col += 1

    def looking_at(s):
        # type: (str) -> bool
        return text[state.i:state.i+len(s)] == s

    def looking_at_html_start():
        # type: () -> bool
        return looking_at("<") and not looking_at("</")

    def looking_at_html_end():
        # type: () -> bool
        return looking_at("</")

    def looking_at_handlebars_start():
        # type: () -> bool
        return looking_at("{{#") or looking_at("{{^")

    def looking_at_handlebars_end():
        # type: () -> bool
        return looking_at("{{/")

    def looking_at_django_start():
        # type: () -> bool
        return looking_at("{% ") and not looking_at("{% end")

    def looking_at_django_end():
        # type: () -> bool
        return looking_at("{% end")

    def first_word(body):
        # type: (str) -> str
        # Return the tag name (first whitespace-separated word) of a tag
        # body, failing with a located message instead of a bare
        # IndexError when the body is empty or all whitespace.
        words = body.split()
        if not words:
            raise Exception('Tag name missing at line %d, col %d' % (state.line, state.col))
        return words[0]

    state = TokenizerState()
    tokens = []  # type: List[Token]

    while state.i < len(text):
        if looking_at_html_start():
            s = get_html_tag(text, state.i)
            tag = first_word(s[1:-1])
            if is_special_html_tag(s, tag):
                kind = 'html_special'
            elif s.endswith('/>'):
                kind = 'html_singleton'
            else:
                kind = 'html_start'
        elif looking_at_html_end():
            s = get_html_tag(text, state.i)
            tag = s[2:-1]
            kind = 'html_end'
        elif looking_at_handlebars_start():
            s = get_handlebars_tag(text, state.i)
            tag = first_word(s[3:-2])
            kind = 'handlebars_start'
        elif looking_at_handlebars_end():
            s = get_handlebars_tag(text, state.i)
            tag = s[3:-2]
            kind = 'handlebars_end'
        elif looking_at_django_start():
            s = get_django_tag(text, state.i)
            tag = first_word(s[3:-2])
            kind = 'django_start'
        elif looking_at_django_end():
            s = get_django_tag(text, state.i)
            # Strips the leading "{% end" and the trailing " %}".
            tag = s[6:-3]
            kind = 'django_end'
        else:
            # Not at the start of any tag we track; skip one character.
            advance(1)
            continue

        # The position is captured before advancing, so it points at the
        # first character of the tag.
        token = Token(
            kind=kind,
            s=s,
            tag=tag,
            line=state.line,
            col=state.col,
        )
        tokens.append(token)
        advance(len(s))

    return tokens
 | 
						|
 | 
						|
def validate(fn=None, text=None, check_indent=True):
    # type: (Optional[str], Optional[str], bool) -> None
    """Check that the tags in a template are balanced and well indented.

    Either ``fn`` (a path to read) or ``text`` (the template contents)
    must be supplied; when ``text`` is omitted the file ``fn`` is read.
    ``fn`` is also used to label error messages.

    Raises Exception when an end tag has no matching start tag, when a
    start/end pair is mismatched, when a ``code`` element ends on the
    line right after it starts, when ``check_indent`` is set and an end
    tag more than ``max_lines`` lines below its start tag is not in the
    same column, or when tags are left unclosed at the end of the text.
    """
    assert fn or text

    if fn is None:
        fn = '<in memory file>'

    if text is None:
        # Close the handle deterministically rather than leaking it.
        with open(fn) as f:
            text = f.read()

    tokens = tokenize(text)

    class State(object):
        # Tracks nesting depth and the callback that will handle the
        # next end tag.
        def __init__(self, func):
            # type: (Callable[[Token], None]) -> None
            self.depth = 0
            self.matcher = func

    def no_start_tag(token):
        # type: (Token) -> None
        # Matcher used at depth 0: any end tag seen here is an orphan.
        raise Exception('''
            No start tag
            fn: %s
            end tag:
                %s
                line %d, col %d
            ''' % (fn, token.tag, token.line, token.col))

    state = State(no_start_tag)

    def start_tag_matcher(start_token):
        # type: (Token) -> None
        # Push a new matcher that expects the end tag for start_token and
        # restores the previous matcher once that end tag arrives.
        state.depth += 1
        start_tag = start_token.tag
        start_line = start_token.line
        start_col = start_token.col

        old_matcher = state.matcher

        def f(end_token):
            # type: (Token) -> None

            end_tag = end_token.tag
            end_line = end_token.line
            end_col = end_token.col

            # Anchors get a larger allowance before indentation is checked.
            if start_tag == 'a':
                max_lines = 3
            else:
                max_lines = 1

            problem = None
            if (start_tag == 'code') and (end_line == start_line + 1):
                problem = 'Code tag is split across two lines.'
            if start_tag != end_tag:
                problem = 'Mismatched tag.'
            elif check_indent and (end_line > start_line + max_lines):
                if end_col != start_col:
                    problem = 'Bad indentation.'
            if problem:
                raise Exception('''
                    fn: %s
                    %s
                    start:
                        %s
                        line %d, col %d
                    end tag:
                        %s
                        line %d, col %d
                    ''' % (fn, problem, start_token.s, start_line, start_col, end_tag, end_line, end_col))
            state.matcher = old_matcher
            state.depth -= 1
        state.matcher = f

    for token in tokens:
        kind = token.kind
        tag = token.tag

        if kind == 'html_start':
            start_tag_matcher(token)
        elif kind == 'html_end':
            state.matcher(token)

        elif kind == 'handlebars_start':
            start_tag_matcher(token)
        elif kind == 'handlebars_end':
            state.matcher(token)

        elif kind == 'django_start':
            # Only block tags have a matching end tag to wait for.
            if is_django_block_tag(tag):
                start_tag_matcher(token)
        elif kind == 'django_end':
            state.matcher(token)

    # Sentinel end tag: feeding it to the pending matcher reports the
    # innermost unclosed start tag as a mismatch.
    null_token = Token(
        kind=None,
        s='(NO TAG)',
        tag='NO TAG',
        line=0,
        col=0,
    )

    if state.depth != 0:
        state.matcher(null_token)
 | 
						|
 | 
						|
def is_special_html_tag(s, tag):
    # type: (str, str) -> bool
    """Return True for HTML comments and for link, meta and !DOCTYPE tags.

    ``s`` is the raw tag text and ``tag`` is the tag name taken from it.
    """
    if s.startswith('<!--'):
        return True
    return tag in ('link', 'meta', '!DOCTYPE')
 | 
						|
 | 
						|
def is_django_block_tag(tag):
    # type: (str) -> bool
    """Return True when ``tag`` is one of the listed block-style tag names."""
    block_tags = (
        'autoescape',
        'block',
        'comment',
        'for',
        'if',
        'ifequal',
        'verbatim',
        'blocktrans',
        'trans',
        'raw',
    )
    return tag in block_tags
 | 
						|
 | 
						|
def get_handlebars_tag(text, i):
    # type: (str, int) -> str
    """Return the handlebars tag that starts at index ``i`` of ``text``.

    The tag runs from ``i`` through the first ``}`` found at or after
    ``i + 2``, which must be immediately followed by a second ``}``.

    Raises Exception('Tag missing }}') when no such terminator exists,
    including when the text ends right after a single ``}``.
    """
    end = text.find('}', i + 2)
    # Slicing (rather than indexing end + 1) stays in bounds when the
    # first '}' is the last character of the text.
    if end == -1 or text[end:end + 2] != '}}':
        raise Exception('Tag missing }}')
    return text[i:end + 2]
 | 
						|
 | 
						|
def get_django_tag(text, i):
    # type: (str, int) -> str
    """Return the ``{% ... %}`` tag that starts at index ``i`` of ``text``.

    The tag runs from ``i`` through the first ``%`` found at or after
    ``i + 2`` (skipping the opening ``{%``), which must be immediately
    followed by ``}``.

    Raises Exception('Tag missing %}') when no such terminator exists,
    including when the text ends right after a lone ``%``.
    """
    end = text.find('%', i + 2)
    # Slicing (rather than indexing end + 1) stays in bounds when the
    # first '%' is the last character of the text.
    if end == -1 or text[end:end + 2] != '%}':
        raise Exception('Tag missing %}')
    return text[i:end + 2]
 | 
						|
 | 
						|
def get_html_tag(text, i):
    # type: (str, int) -> str
    """Return the HTML tag that starts at index ``i`` of ``text``.

    Scans forward from ``i + 1`` to the first ``>`` that is not inside a
    double-quoted section, so a ``>`` within an attribute value does not
    end the tag.

    Raises Exception('Tag missing >') when the text ends first.
    """
    length = len(text)
    pos = i + 1
    inside_quotes = False
    while pos < length:
        ch = text[pos]
        if ch == '>' and not inside_quotes:
            break
        if ch == '"':
            inside_quotes = not inside_quotes
        pos += 1

    if pos == length or text[pos] != '>':
        raise Exception('Tag missing >')
    return text[i:pos + 1]
 | 
						|
 | 
						|
class Node(object):
    """One element in the HTML tag tree.

    Holds the token for the tag, the list of child nodes (initially
    empty), and a reference to the parent node (None for the root).
    """

    def __init__(self, token, parent):
        # type: (Optional[Token], Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Keep the parent that was passed in; it was previously discarded.
        self.parent = parent  # type: Optional[Node]
 | 
						|
 | 
						|
class TagInfo(object):
    """Tag name plus the classes and ids found on one HTML tag.

    ``words`` lists the tag name, then each class prefixed with ``.``,
    then each id prefixed with ``#``.
    """

    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        words = [self.tag]
        words.extend('.' + name for name in classes)
        words.extend('#' + name for name in ids)
        self.words = words

    def text(self):
        # type: () -> str
        """Return a compact label such as ``p.cls1.cls2#some-id``."""
        pieces = [self.tag]
        if self.classes:
            pieces.append('.' + '.'.join(self.classes))
        if self.ids:
            pieces.append('#' + '#'.join(self.ids))
        return ''.join(pieces)
 | 
						|
 | 
						|
def get_tag_info(token):
    # type: (Token) -> TagInfo
    """Build a TagInfo from the class and id attributes in a token's text.

    Looks for the first double-quoted and the first single-quoted
    ``class`` and ``id`` attribute in ``token.s`` and splits each value
    on whitespace.
    """
    raw = token.s
    classes = []  # type: List[str]
    ids = []  # type: List[str]

    # (destination list, attribute pattern) pairs, tried in this order.
    patterns = (
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    )

    for dest, pattern in patterns:
        match = re.search(pattern, raw)
        if match is None:
            continue
        for value in match.groups():
            dest.extend(value.split())

    return TagInfo(tag=token.tag, classes=classes, ids=ids, token=token)
 | 
						|
 | 
						|
class HtmlTreeBranch(object):
    '''
    Represents the chain of tags from the top of the tree down to one
    leaf.  For <p><div id='yo'>bla<span class='bar'></span></div></p>
    the branch is conceptually "p div(#yo) span(.bar)".
    '''

    def __init__(self, tags, fn):
        # type: (List[TagInfo], str) -> None
        self.tags = tags
        self.fn = fn
        # The branch is located by the line of its last (leaf) tag.
        leaf = tags[-1]
        self.line = leaf.token.line

        # Union of every tag's words along the branch.
        self.words = {word for info in tags for word in info.words}  # type: Set[str]

    def staircase_text(self):
        # type: () -> str
        '''
        Render the branch one tag per line, each level indented four
        spaces further than the one above:

            html
                body.main-section
                    p#intro

        '''
        step = ' ' * 4
        rows = [
            step * depth + info.text() + '\n'
            for depth, info in enumerate(self.tags, 1)
        ]
        return '\n' + ''.join(rows)

    def text(self):
        # type: () -> str
        '''
        Render the branch on a single line, tags separated by spaces:

        html body.main-section p#intro
        '''
        labels = [info.text() for info in self.tags]
        return ' '.join(labels)
 | 
						|
 | 
						|
def html_branches(fn):
    # type: (str) -> List[HtmlTreeBranch]
    """Return one HtmlTreeBranch per leaf tag in the template file ``fn``.

    Reads the file, builds its HTML tag tree, and walks every top-level
    node depth-first.  Each node without children yields a branch made
    of the TagInfo for every tag on the path down to it.
    """
    # Close the handle deterministically rather than leaking it.
    with open(fn) as f:
        text = f.read()
    tree = html_tag_tree(text)
    branches = []  # type: List[HtmlTreeBranch]

    def walk(node, tag_info_list=None):
        # type: (Node, Optional[List[TagInfo]]) -> None
        # Extend the path with this node; a new list is built each time
        # so sibling subtrees never share a path.
        info = get_tag_info(node.token)
        if tag_info_list is None:
            tag_info_list = [info]
        else:
            tag_info_list = tag_info_list + [info]

        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            # Leaf node: the accumulated path is a complete branch.
            tree_branch = HtmlTreeBranch(tags=tag_info_list, fn=fn)
            branches.append(tree_branch)

    for node in tree.children:
        walk(node, None)

    return branches
 | 
						|
 | 
						|
def html_tag_tree(text):
    # type: (str) -> Node
    """Build a tree of Nodes from the HTML tags in ``text``.

    Returns a token-less root Node whose descendants mirror the nesting
    of the HTML start and singleton tags.  Non-HTML tokens are ignored.
    Each HTML end tag closes the most recently opened start tag; the
    tag names are not compared here.
    """
    tokens = tokenize(text)
    top_level = Node(token=None, parent=None)
    # Nodes whose end tag has not been seen yet; the last is the current parent.
    stack = [top_level]

    for token in tokens:
        if token.kind in ('html_start', 'html_singleton'):
            if not is_special_html_tag(token.s, token.tag):
                parent = stack[-1]
                node = Node(token=token, parent=parent)
                parent.children.append(node)
                # Only start tags open a new nesting level.  Pushing inside
                # this guard ensures a stale or unbound `node` is never used.
                if token.kind == 'html_start':
                    stack.append(node)
        elif token.kind == 'html_end':
            stack.pop()

    return top_level
 | 
						|
 |