Files
zulip/tools/lib/template_parser.py
Gordon P. Hemsley 3ffc6b5a35 Clean up how missing end tags are handled in template parsing.
The null token was an artificial placeholder that wasn't making clear what
the problem was. Throwing an exception is bolder.
2016-08-30 19:24:17 -04:00

411 lines
11 KiB
Python

from __future__ import absolute_import
from __future__ import print_function

import re
from typing import Callable, List, Optional, Set

from six.moves import range
class TemplateParserException(Exception):
    """Raised when a template cannot be tokenized or fails validation."""
    # TODO: Have callers pass in line numbers.
class TokenizerState(object):
    """Mutable cursor used while scanning a template.

    Attributes:
        i: offset of the next character to examine (starts at 0).
        line: line number of that character (starts at 1).
        col: column number of that character (starts at 1).
    """

    def __init__(self):
        # type: () -> None
        self.i, self.line, self.col = 0, 1, 1
class Token(object):
    """A single tag recognised in a template.

    Attributes:
        kind: category label assigned by the tokenizer (e.g. 'html_start').
        s: the full text of the tag as it appears in the template.
        tag: the tag name extracted from ``s``.
        line: line on which the tag begins.
        col: column at which the tag begins.
    """

    def __init__(self, kind, s, tag, line, col):
        # type: (str, str, str, int, int) -> None
        self.kind, self.s, self.tag = kind, s, tag
        self.line, self.col = line, col
def tokenize(text):
    # type: (str) -> List[Token]
    """Split template text into a list of tag tokens.

    The text is scanned left to right.  HTML tags (``<...>`` / ``</...>``),
    Handlebars block tags (``{{#``, ``{{^``, ``{{/``) and Django tags
    (``{% ... %}``) become ``Token`` objects; every other character is
    skipped.  Each token records the line and column where its tag starts.

    Raises:
        TemplateParserException: if a start tag has no name, or if one of
            the tag-extraction helpers finds an unterminated tag.
    """
    state = TokenizerState()
    tokens = []  # type: List[Token]

    def advance(n):
        # type: (int) -> None
        # Consume n characters, keeping line/col in step with the offset.
        for _ in range(n):
            state.i += 1
            if text[state.i - 1] == '\n':
                state.line += 1
                state.col = 1
            else:
                state.col += 1

    def looking_at(s):
        # type: (str) -> bool
        return text.startswith(s, state.i)

    def looking_at_html_start():
        # type: () -> bool
        return looking_at("<") and not looking_at("</")

    def looking_at_html_end():
        # type: () -> bool
        return looking_at("</")

    def looking_at_handlebars_start():
        # type: () -> bool
        return looking_at("{{#") or looking_at("{{^")

    def looking_at_handlebars_end():
        # type: () -> bool
        return looking_at("{{/")

    def looking_at_django_start():
        # type: () -> bool
        return looking_at("{% ") and not looking_at("{% end")

    def looking_at_django_end():
        # type: () -> bool
        return looking_at("{% end")

    def first_word(body):
        # type: (str) -> str
        # A tag such as "<>", "{{#}}" or "{% %}" has no name; report that
        # as a parser error instead of letting an IndexError escape.
        parts = body.split()
        if not parts:
            raise TemplateParserException("Tag name missing")
        return parts[0]

    while state.i < len(text):
        if looking_at_html_start():
            s = get_html_tag(text, state.i)
            tag = first_word(s[1:-1])
            if is_special_html_tag(s, tag):
                kind = 'html_special'
            elif s.endswith('/>'):
                kind = 'html_singleton'
            else:
                kind = 'html_start'
        elif looking_at_html_end():
            s = get_html_tag(text, state.i)
            tag = s[2:-1]
            kind = 'html_end'
        elif looking_at_handlebars_start():
            s = get_handlebars_tag(text, state.i)
            tag = first_word(s[3:-2])
            kind = 'handlebars_start'
        elif looking_at_handlebars_end():
            s = get_handlebars_tag(text, state.i)
            tag = s[3:-2]
            kind = 'handlebars_end'
        elif looking_at_django_start():
            s = get_django_tag(text, state.i)
            tag = first_word(s[3:-2])
            kind = 'django_start'
        elif looking_at_django_end():
            s = get_django_tag(text, state.i)
            # Strips the leading "{% end" and the trailing " %}".
            tag = s[6:-3]
            kind = 'django_end'
        else:
            advance(1)
            continue

        # Record the position before advancing so it points at the tag start.
        tokens.append(Token(
            kind=kind,
            s=s,
            tag=tag,
            line=state.line,
            col=state.col,
        ))
        advance(len(s))

    return tokens
def validate(fn=None, text=None, check_indent=True):
    # type: (Optional[str], Optional[str], bool) -> None
    """Check that the start and end tags of a template are properly nested.

    Args:
        fn: path of the template.  It is used in error messages, and the
            file is read when ``text`` is not supplied.
        text: template contents; when None, the contents of ``fn`` are used.
        check_indent: when true, an end tag that is more than a few lines
            below its start tag (3 for ``a`` tags, 1 otherwise) must begin
            in the same column as the start tag.

    Raises:
        TemplateParserException: for an end tag without a start tag, a
            mismatched end tag, a ``code`` tag closed on the following line,
            bad indentation, or a start tag that is never closed.
    """
    assert fn or text

    if fn is None:
        fn = '<in memory file>'

    if text is None:
        # Read through a context manager so the file handle is closed
        # immediately rather than whenever the garbage collector runs.
        with open(fn) as template_file:
            text = template_file.read()

    tokens = tokenize(text)

    class State(object):
        def __init__(self, func):
            # type: (Callable[[Token], None]) -> None
            self.depth = 0
            self.matcher = func

    def no_start_tag(token):
        # type: (Token) -> None
        # Matcher in effect while no start tag is open.
        raise TemplateParserException(
            '\n'
            'No start tag\n'
            'fn: %s\n'
            'end tag:\n'
            '%s\n'
            'line %d, col %d\n'
            % (fn, token.tag, token.line, token.col))

    state = State(no_start_tag)

    def start_tag_matcher(start_token):
        # type: (Token) -> None
        # Install a matcher for this start tag; the previous matcher is
        # restored once the matching end tag has been accepted, so the
        # chain of closures behaves like a stack of open tags.
        state.depth += 1
        start_tag = start_token.tag
        start_line = start_token.line
        start_col = start_token.col

        old_matcher = state.matcher

        def f(end_token):
            # type: (Token) -> None
            end_tag = end_token.tag
            end_line = end_token.line
            end_col = end_token.col

            if start_tag == 'a':
                max_lines = 3
            else:
                max_lines = 1

            problem = None
            if (start_tag == 'code') and (end_line == start_line + 1):
                problem = 'Code tag is split across two lines.'
            # A mismatch takes precedence over the code-tag complaint above.
            if start_tag != end_tag:
                problem = 'Mismatched tag.'
            elif check_indent and (end_line > start_line + max_lines):
                if end_col != start_col:
                    problem = 'Bad indentation.'
            if problem:
                raise TemplateParserException(
                    '\n'
                    'fn: %s\n'
                    '%s\n'
                    'start:\n'
                    '%s\n'
                    'line %d, col %d\n'
                    'end tag:\n'
                    '%s\n'
                    'line %d, col %d\n'
                    % (fn, problem, start_token.s, start_line, start_col,
                       end_tag, end_line, end_col))
            state.matcher = old_matcher
            state.depth -= 1

        state.matcher = f

    for token in tokens:
        kind = token.kind
        tag = token.tag

        if kind == 'html_start':
            start_tag_matcher(token)
        elif kind == 'html_end':
            state.matcher(token)

        elif kind == 'handlebars_start':
            start_tag_matcher(token)
        elif kind == 'handlebars_end':
            state.matcher(token)

        elif kind == 'django_start':
            # Only Django tags that take an end tag open a new scope.
            if is_django_block_tag(tag):
                start_tag_matcher(token)
        elif kind == 'django_end':
            state.matcher(token)

    if state.depth != 0:
        raise TemplateParserException('Missing end tag')
def is_special_html_tag(s, tag):
    # type: (str, str) -> bool
    """Return True for HTML comments and for link, meta and !DOCTYPE tags.

    ``s`` is the full tag text and ``tag`` is its extracted name.
    """
    if s.startswith('<!--'):
        return True
    return tag in ('link', 'meta', '!DOCTYPE')
def is_django_block_tag(tag):
    # type: (str) -> bool
    """Return True if ``tag`` is one of the Django tags listed as block tags."""
    block_tags = (
        'autoescape',
        'block',
        'comment',
        'for',
        'if',
        'ifequal',
        'verbatim',
        'blocktrans',
        'trans',
        'raw',
    )
    return tag in block_tags
def get_handlebars_tag(text, i):
    # type: (str, int) -> str
    """Return the Handlebars tag that starts at offset ``i`` of ``text``.

    The tag runs from ``i`` up to and including the first ``}}`` found
    after the two opening braces.

    Raises:
        TemplateParserException: if the first ``}`` after the opening
            braces is not immediately followed by another ``}``, or if the
            text ends before any ``}`` is found.
    """
    # str.find/startswith never index past the end of the text, so a lone
    # "}" as the final character is reported as a parser error rather than
    # surfacing as an IndexError.
    close = text.find('}', i + 2)
    if close == -1 or not text.startswith('}}', close):
        raise TemplateParserException('Tag missing }}')
    return text[i:close + 2]
def get_django_tag(text, i):
    # type: (str, int) -> str
    """Return the Django tag that starts at offset ``i`` of ``text``.

    The tag runs from ``i`` up to and including the first ``%}`` found
    after the opening ``{%``.

    Raises:
        TemplateParserException: if the first ``%`` after the opening
            delimiter is not immediately followed by ``}``, or if the text
            ends before any ``%`` is found.
    """
    # str.find/startswith never index past the end of the text, so a lone
    # "%" as the final character is reported as a parser error rather than
    # surfacing as an IndexError.
    close = text.find('%', i + 2)
    if close == -1 or not text.startswith('%}', close):
        raise TemplateParserException('Tag missing %}')
    return text[i:close + 2]
def get_html_tag(text, i):
    # type: (str, int) -> str
    """Return the HTML tag that starts at offset ``i`` of ``text``.

    The tag ends at the first ``>`` that is not inside a double-quoted
    section, so a ``>`` within an attribute value does not end the tag.

    Raises:
        TemplateParserException: if no closing ``>`` is found.
    """
    length = len(text)
    inside_quotes = False
    end = i + 1
    while end < length:
        ch = text[end]
        if ch == '>' and not inside_quotes:
            break
        if ch == '"':
            # Each double quote toggles between quoted and unquoted text.
            inside_quotes = not inside_quotes
        end += 1

    if end == length or text[end] != '>':
        raise TemplateParserException('Tag missing >')

    return text[i:end + 1]
class Node(object):
    """A node in the tree of HTML tags built from a template.

    Attributes:
        token: the tag token this node represents (None for a root node
            created without a token).
        children: child nodes, in the order they were appended.
        parent: the enclosing node, or None when there is none.
    """

    def __init__(self, token, parent):
        # type: (Optional[Token], Optional[Node]) -> None
        self.token = token
        self.children = []  # type: List[Node]
        # Store the parent the caller supplied; previously the argument was
        # accepted but discarded, leaving every node with parent None.
        self.parent = parent  # type: Optional[Node]
class TagInfo(object):
    """Name, classes and ids of one HTML tag, plus the token it came from.

    ``words`` holds the tag name followed by each class prefixed with '.'
    and each id prefixed with '#'.
    """

    def __init__(self, tag, classes, ids, token):
        # type: (str, List[str], List[str], Token) -> None
        self.tag = tag
        self.classes = classes
        self.ids = ids
        self.token = token

        words = [self.tag]
        words.extend(['.' + name for name in classes])
        words.extend(['#' + name for name in ids])
        self.words = words

    def text(self):
        # type: () -> str
        """Return a compact selector-like form such as ``div.a.b#x``."""
        class_part = ''.join(['.' + name for name in self.classes])
        id_part = ''.join(['#' + name for name in self.ids])
        return self.tag + class_part + id_part
def get_tag_info(token):
    # type: (Token) -> TagInfo
    """Build a TagInfo from a token's tag name and its class/id attributes.

    The first double-quoted and the first single-quoted ``class`` and
    ``id`` attribute in the tag text are each split on whitespace.
    """
    raw = token.s
    classes = []  # type: List[str]
    ids = []  # type: List[str]

    # Each pattern is paired with the list its matches are collected into.
    patterns = (
        (classes, ' class="(.*?)"'),
        (classes, " class='(.*?)'"),
        (ids, ' id="(.*?)"'),
        (ids, " id='(.*?)'"),
    )
    for bucket, pattern in patterns:
        match = re.search(pattern, raw)
        if not match:
            continue
        for captured in match.groups():
            bucket.extend(captured.split())

    return TagInfo(tag=token.tag, classes=classes, ids=ids, token=token)
class HtmlTreeBranch(object):
    '''
    One root-to-leaf path through the HTML tag tree.

    For <p><div id='yo'>bla<span class='bar'></span></div></p>, this holds
    the tags all the way down to the leaf, conceptually something like
    "p div(#yo) span(.bar)".
    '''

    def __init__(self, tags, fn):
        # type: (List[TagInfo], str) -> None
        self.tags = tags
        self.fn = fn
        # The branch is located by the line of its last (leaf) tag.
        self.line = tags[-1].token.line
        # Every word contributed by any tag on the branch.
        self.words = {word for tag in tags for word in tag.words}  # type: Set[str]

    def staircase_text(self):
        # type: () -> str
        '''
        produces representation of a node in staircase-like format:

            html
                body.main-section
                    p#intro
        '''
        rows = [
            ' ' * (4 * depth) + t.text() + '\n'
            for depth, t in enumerate(self.tags, 1)
        ]
        return '\n' + ''.join(rows)

    def text(self):
        # type: () -> str
        '''
        produces one-line representation of branch:

            html body.main-section p#intro
        '''
        labels = [t.text() for t in self.tags]
        return ' '.join(labels)
def html_branches(fn):
    # type: (str) -> List[HtmlTreeBranch]
    """Return one HtmlTreeBranch per leaf tag of the template file ``fn``.

    The file is read, turned into a tag tree, and every path from a
    top-level tag down to a childless tag becomes one branch.
    """
    # Read through a context manager so the file handle is closed
    # immediately rather than whenever the garbage collector runs.
    with open(fn) as template_file:
        text = template_file.read()

    tree = html_tag_tree(text)
    branches = []  # type: List[HtmlTreeBranch]

    def walk(node, tag_info_list=None):
        # type: (Node, Optional[List[TagInfo]]) -> None
        info = get_tag_info(node.token)
        # Build a fresh list at every level so sibling subtrees never
        # share (and mutate) the same path.
        if tag_info_list is None:
            tag_info_list = [info]
        else:
            tag_info_list = tag_info_list[:] + [info]

        if node.children:
            for child in node.children:
                walk(node=child, tag_info_list=tag_info_list)
        else:
            tree_branch = HtmlTreeBranch(tags=tag_info_list, fn=fn)
            branches.append(tree_branch)

    # The root node carries no token, so start from its children.
    for node in tree.children:
        walk(node, None)

    return branches
def html_tag_tree(text):
    # type: (str) -> Node
    """Build a tree of the HTML tags in ``text`` and return its root node.

    The root carries no token.  Start tags and singleton tags become
    nodes under the innermost open tag; a start tag then becomes the new
    innermost open tag, and each end tag closes the innermost one.
    """
    tokens = tokenize(text)
    root = Node(token=None, parent=None)
    open_nodes = [root]

    for token in tokens:
        kind = token.kind

        if kind == 'html_end':
            open_nodes.pop()
            continue

        if kind not in ('html_start', 'html_singleton'):
            continue
        if is_special_html_tag(token.s, token.tag):
            continue

        enclosing = open_nodes[-1]
        node = Node(token=token, parent=enclosing)
        enclosing.children.append(node)
        if kind == 'html_start':
            # Only start tags stay open; singletons have no end tag.
            open_nodes.append(node)

    return root