mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-04 05:53:43 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			199 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			199 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
 | 
						|
from collections import defaultdict
 | 
						|
from typing import Dict, List, Optional, Sequence, Set
 | 
						|
 | 
						|
from .template_parser import FormattedException, Token, tokenize
 | 
						|
 | 
						|
 | 
						|
class HtmlBranchesException(Exception):
    """
    Exception type declared by this module; it adds nothing to Exception.

    TODO: Have callers pass in line numbers.
    """
 | 
						|
 | 
						|
 | 
						|
class HtmlTreeBranch:
    """
    One path of tags from an outermost tag down to a leaf tag.

    For <p><div id='yo'>bla<span class='bar'></span></div></p>, the branch
    keeps every tag on the way down to the leaf, which would conceptually
    be something like "p div(#yo) span(.bar)".
    """

    def __init__(self, tags: List["TagInfo"], fn: Optional[str]) -> None:
        self.tags = tags
        self.fn = fn
        # The branch is located at the line of its last (leaf) tag.
        self.line = tags[-1].token.line

        # Every word contributed by any tag on the branch, de-duplicated.
        self.words: Set[str] = {word for tag in tags for word in tag.words}

    def staircase_text(self) -> str:
        """
        Render the branch one tag per line, each level indented four
        more spaces than the previous one:

            html
                body.main-section
                    p#intro

        The result starts with a newline and ends with a newline.
        """
        step = " " * 4
        rows = [
            step * depth + tag.text() + "\n"
            for depth, tag in enumerate(self.tags, start=1)
        ]
        return "\n" + "".join(rows)

    def text(self) -> str:
        """
        Render the branch on a single line, tags separated by spaces:

        html body.main-section p#intro
        """
        rendered = [tag.text() for tag in self.tags]
        return " ".join(rendered)
 | 
						|
 | 
						|
 | 
						|
class Node:
    """
    One node of the tag tree built by html_tag_tree.

    token is the Token this node was created for; html_tag_tree passes
    None for the top-level root node.  children holds the nodes appended
    under this one, and parent is the node passed to the constructor
    (None for the root).
    """

    def __init__(self, token: "Optional[Token]", parent: "Optional[Node]") -> None:
        self.token = token
        self.children: List[Node] = []
        # Store the parent we were given; it used to be silently dropped,
        # leaving every node with parent=None.
        self.parent: Optional[Node] = parent
 | 
						|
 | 
						|
 | 
						|
class TagInfo:
    """
    A tag name together with its class names, its ids and the token it
    was read from.
    """

    def __init__(self, tag: str, classes: List[str], ids: List[str], token: "Token") -> None:
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        # Selector-like words: the bare tag first, then one ".name" per
        # class, then one "#name" per id.
        words = [tag]
        words.extend("." + name for name in classes)
        words.extend("#" + name for name in ids)
        self.words = words

    def text(self) -> str:
        """
        Return the tag followed by each class prefixed with "." and each
        id prefixed with "#", e.g. "div.a.b#x".
        """
        class_part = "".join("." + name for name in self.classes)
        id_part = "".join("#" + name for name in self.ids)
        return self.tag + class_part + id_part
 | 
						|
 | 
						|
 | 
						|
def get_tag_info(token: Token) -> TagInfo:
    """
    Build a TagInfo for token from its tag name and its raw text.

    The raw text (token.s) is searched for class and id attributes
    written with either double or single quotes and preceded by a single
    space.  Only the first match of each pattern is used (re.search), and
    each matched value is broken up by split_for_id_and_class.
    """
    markup = token.s
    tag = token.tag
    classes: List[str] = []
    ids: List[str] = []

    # Each entry pairs the list to fill with the pattern that feeds it.
    attribute_patterns = (
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    )

    for bucket, pattern in attribute_patterns:
        match = re.search(pattern, markup)
        if match is None:
            continue
        for value in match.groups():
            bucket.extend(split_for_id_and_class(value))

    return TagInfo(tag=tag, classes=classes, ids=ids, token=token)
 | 
						|
 | 
						|
 | 
						|
def split_for_id_and_class(element: str) -> List[str]:
    """
    Split the value of an HTML id or class attribute into its tokens.

    Whitespace inside template braces does not split, so template
    variables survive as a single token.  For example
    'red black {{ a|b|c }}' is split as ['red', 'black', '{{ a|b|c }}'].

    Any ASCII whitespace character (space, tab, LF, FF, CR) outside braces
    separates tokens, which is how HTML separates the entries of a class
    list; runs of separators never produce empty tokens.
    """
    separators = " \t\n\f\r"
    outside_braces = True
    pieces: List[str] = []
    current: List[str] = []

    for ch in element:
        # Brace tracking is a simple flag, not a depth counter: any "{"
        # enters template mode and any "}" leaves it.
        if ch == "{":
            outside_braces = False
        elif ch == "}":
            outside_braces = True

        if outside_braces and ch in separators:
            if current:
                pieces.append("".join(current))
                current = []
        else:
            current.append(ch)

    # Flush the token that was still being collected at end of input.
    if current:
        pieces.append("".join(current))

    return pieces
 | 
						|
 | 
						|
 | 
						|
def html_branches(text: str, fn: Optional[str] = None) -> List[HtmlTreeBranch]:
    """
    Return one HtmlTreeBranch for every leaf tag found in text.

    The text is turned into a tag tree with html_tag_tree, and each path
    from a top-level tag down to a tag without children becomes a branch.
    fn is passed to html_tag_tree and recorded on every branch.
    """
    tree = html_tag_tree(text, fn)
    branches: List[HtmlTreeBranch] = []

    # The default is an immutable empty tuple rather than a shared list.
    def walk(node: Node, tag_info_list: Sequence[TagInfo] = ()) -> None:
        # Only the root of the tree is created without a token, and the
        # root itself is never walked (see the loop below).
        assert node.token is not None
        info = get_tag_info(node.token)
        # Build a fresh list so sibling subtrees never share one path.
        tag_info_list = [*tag_info_list, info]

        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            # A node without children ends a branch.
            tree_branch = HtmlTreeBranch(tags=tag_info_list, fn=fn)
            branches.append(tree_branch)

    for node in tree.children:
        walk(node)

    return branches
 | 
						|
 | 
						|
 | 
						|
def html_tag_tree(text: str, fn: Optional[str] = None) -> Node:
    """
    Tokenize text and arrange its HTML tags into a tree of Node objects.

    The returned root node has no token.  Every "html_start" and
    "html_singleton" token becomes a child of the innermost tag that is
    still open; an "html_end" token closes the innermost open tag.  Tokens
    of any other kind are ignored.  fn is accepted but not used here.
    """
    tokens = tokenize(text)
    root = Node(token=None, parent=None)
    # open_nodes[-1] is always the node that receives the next child.
    open_nodes = [root]

    for token in tokens:
        kind = token.kind

        if kind == "html_end":
            # Closing tag: go back to appending under the enclosing node.
            open_nodes.pop()
            continue

        if kind not in ("html_start", "html_singleton"):
            continue

        enclosing = open_nodes[-1]
        created = Node(token=token, parent=enclosing)
        enclosing.children.append(created)

        # Only a start tag can contain later tags; a singleton cannot.
        if kind == "html_start":
            open_nodes.append(created)

    return root
 | 
						|
 | 
						|
 | 
						|
def build_id_dict(templates: List[str]) -> Dict[str, List[str]]:
    """
    Map every id found in the given template files to where it occurs.

    templates is a list of file paths.  Each file is read and tokenized,
    and every id reported by get_tag_info is recorded as a
    "Line <line>:<path>" entry under that id.

    Raises Exception, chained to the original FormattedException, when a
    template cannot be tokenized; the message names the offending file.
    """
    template_id_dict: Dict[str, List[str]] = defaultdict(list)

    for fn in templates:
        # NOTE(review): the file is opened with the platform default
        # encoding -- confirm that is intended for these templates.
        with open(fn) as f:
            text = f.read()

        try:
            list_tags = tokenize(text)
        except FormattedException as e:
            # Add the file name to the message and keep the tokenizer
            # error attached as the explicit cause.
            raise Exception(
                f"""
                fn: {fn}
                {e}"""
            ) from e

        for tag in list_tags:
            info = get_tag_info(tag)

            for tag_id in info.ids:
                template_id_dict[tag_id].append(f"Line {info.token.line}:{fn}")

    return template_id_dict
 |