Files
zulip/tools/lib/template_parser.py
Steve Howell c4b181a169 check-templates: Insist on 4-space indents.
We now complain about programmers who don't use
4-space indents in template files, rather than
letting the pretty printer fix them.

This is partly just to simplify the pretty printer
code (in future commits), but it also makes the
symptom more obvious to newbie developers. They
are probably just as able to react to the direct
error messages as they are able to figure out how
to read diffs from the pretty printer and grok
the --fix syntax. And once they learn the convention
and configure their editor, it should then be a
one-time problem.
2021-12-02 09:49:10 -08:00

662 lines
20 KiB
Python

from typing import Callable, List, Optional
class FormattedException(Exception):
    """Tokenization failure whose message already carries line/column context.

    tokenize() raises this after decorating a TokenizationException with the
    current cursor position; validate() converts it to TemplateParserException.
    """
class TemplateParserException(Exception):
    """Error reported to the user when a template fails validation."""

    def __init__(self, message: str) -> None:
        # The human-readable report; __str__ hands it back verbatim.
        self.message = message

    def __str__(self) -> str:
        """Return the stored message unchanged."""
        return self.message
class TokenizationException(Exception):
    """Low-level tokenizer error raised by the get_* scanning helpers.

    Attributes are set directly on the instance:
      message       short description of what is wrong
      line_content  the offending snippet of template text, or None
    """

    def __init__(self, message: str, line_content: Optional[str] = None) -> None:
        self.message = message
        self.line_content = line_content
class TokenizerState:
    """Mutable cursor used by tokenize() while walking the template text."""

    def __init__(self) -> None:
        # i is the 0-based offset into the text; line and col start at 1.
        self.i, self.line, self.col = 0, 1, 1
class Token:
    """One lexical unit produced by tokenize().

    kind       category string such as "html_start", "text" or "newline"
    s          the exact slice of template text the token covers
    tag        tag name (or a placeholder such as "var"/"newline"/"")
    line, col  position of the token's first character
    line_span  number of lines the token text occupies
    """

    def __init__(self, kind: str, s: str, tag: str, line: int, col: int, line_span: int) -> None:
        # What the token is.
        self.kind, self.s, self.tag = kind, s, tag
        # Where the token sits in the source.
        self.line, self.col, self.line_span = line, col, line_span
def tokenize(text: str) -> List[Token]:
    """Split template source into a flat list of Token objects.

    The text is scanned left to right.  At each position the first matching
    construct (comment, HTML tag, Handlebars tag, Django/Jinja2 tag, template
    variable, newline, run of spaces, or plain text) is consumed and recorded
    with the line/column at which it starts.  The order of the checks below
    matters: more specific prefixes are tested before more general ones.

    Raises:
        FormattedException: a scanning helper raised TokenizationException;
            the message is extended with the current line and column.
        TemplateParserException: an HTML tag has no name (e.g. "<>").
        AssertionError: a "{" or "<" was reached that no branch can parse.
    """

    def advance(n: int) -> None:
        """Move the cursor forward n characters, tracking line and column."""
        for _ in range(n):
            state.i += 1
            # state.i is at least 1 here, so state.i - 1 is the character
            # that was just consumed.
            if text[state.i - 1] == "\n":
                state.line += 1
                state.col = 1
            else:
                state.col += 1

    def looking_at(s: str) -> bool:
        return text[state.i : state.i + len(s)] == s

    def looking_at_htmlcomment() -> bool:
        return looking_at("<!--")

    def looking_at_handlebarcomment() -> bool:
        return looking_at("{{!")

    def looking_at_djangocomment() -> bool:
        return looking_at("{#")

    def looking_at_handlebarpartial() -> bool:
        return looking_at("{{>")

    def looking_at_html_start() -> bool:
        return looking_at("<") and not looking_at("</")

    def looking_at_html_end() -> bool:
        return looking_at("</")

    def looking_at_handlebars_start() -> bool:
        return looking_at("{{#") or looking_at("{{^") or looking_at("{{~#")

    def looking_at_handlebars_else() -> bool:
        return looking_at("{{else")

    def looking_at_template_var() -> bool:
        return looking_at("{")

    def looking_at_handlebars_end() -> bool:
        return looking_at("{{/") or looking_at("{{~/")

    def looking_at_django_start() -> bool:
        return looking_at("{% ")

    def looking_at_django_else() -> bool:
        return looking_at("{% else") or looking_at("{% elif")

    def looking_at_django_end() -> bool:
        return looking_at("{% end")

    def looking_at_jinja2_end_whitespace_stripped() -> bool:
        return looking_at("{%- end")

    def looking_at_jinja2_start_whitespace_stripped_type2() -> bool:
        # This function detects tag like {%- if foo -%}...{% endif %}
        return looking_at("{%-") and not looking_at("{%- end")

    state = TokenizerState()
    tokens: List[Token] = []

    while state.i < len(text):
        try:
            if looking_at_htmlcomment():
                s = get_html_comment(text, state.i)
                tag = s[4:-3]
                kind = "html_comment"
            elif looking_at_handlebarcomment():
                s = get_handlebar_comment(text, state.i)
                tag = s[3:-2]
                kind = "handlebar_comment"
            elif looking_at_djangocomment():
                s = get_django_comment(text, state.i)
                tag = s[2:-2]
                kind = "django_comment"
            elif looking_at_handlebarpartial():
                s = get_handlebar_partial(text, state.i)
                # NOTE(review): the 9-character offset does not line up with
                # the 3-character "{{>" prefix -- confirm the intended tag.
                tag = s[9:-2]
                kind = "handlebars_singleton"
            elif looking_at_html_start():
                s = get_html_tag(text, state.i)
                if s.endswith("/>"):
                    end_offset = -2
                else:
                    end_offset = -1
                tag_parts = s[1:end_offset].split()

                if not tag_parts:
                    raise TemplateParserException("Tag name missing")

                tag = tag_parts[0]

                if tag == "!DOCTYPE":
                    kind = "html_doctype"
                elif s.endswith("/>"):
                    kind = "html_singleton"
                else:
                    kind = "html_start"
            elif looking_at_html_end():
                s = get_html_tag(text, state.i)
                tag = s[2:-1]
                kind = "html_end"
            elif looking_at_handlebars_else():
                s = get_handlebars_tag(text, state.i)
                tag = "else"
                kind = "handlebars_else"
            elif looking_at_handlebars_start():
                s = get_handlebars_tag(text, state.i)
                tag = s[3:-2].split()[0].strip("#")
                if tag.startswith("*"):
                    tag = tag[1:]
                kind = "handlebars_start"
            elif looking_at_handlebars_end():
                s = get_handlebars_tag(text, state.i)
                tag = s[3:-2].strip("/#~")
                kind = "handlebars_end"
            elif looking_at_django_else():
                s = get_django_tag(text, state.i)
                tag = "else"
                kind = "django_else"
            elif looking_at_django_end():
                s = get_django_tag(text, state.i)
                tag = s[6:-3]
                kind = "django_end"
            elif looking_at_django_start():
                # must check this after end/else
                s = get_django_tag(text, state.i)
                tag = s[3:-2].split()[0]
                kind = "django_start"

                if s[-3] == "-":
                    kind = "jinja2_whitespace_stripped_start"
            elif looking_at_jinja2_end_whitespace_stripped():
                s = get_django_tag(text, state.i)
                tag = s[7:-3]
                kind = "jinja2_whitespace_stripped_end"
            elif looking_at_jinja2_start_whitespace_stripped_type2():
                s = get_django_tag(text, state.i, stripped=True)
                tag = s[3:-3].split()[0]
                kind = "jinja2_whitespace_stripped_type2_start"
            elif looking_at_template_var():
                # order is important here
                s = get_template_var(text, state.i)
                tag = "var"
                kind = "template_var"
            elif looking_at("\n"):
                s = "\n"
                tag = "newline"
                kind = "newline"
            elif looking_at(" "):
                s = get_spaces(text, state.i)
                tag = ""
                if not tokens or tokens[-1].kind == "newline":
                    kind = "indent"
                else:
                    kind = "whitespace"
            elif text[state.i] in "{<":
                snippet = text[state.i :][:15]
                raise AssertionError(f"tool cannot parse {snippet}")
            elif text[state.i].isspace():
                # Whitespace other than " " and "\n" (tab, "\r", ...).
                # get_text() strips it, so starting a text token here would
                # either yield "" (and previously loop forever without
                # advancing) or a string that no longer matches the cursor.
                # Step over the character so the cursor always makes progress.
                advance(1)
                continue
            else:
                s = get_text(text, state.i)
                if s == "":
                    # Defensive: never spin in place on an empty token.
                    advance(1)
                    continue
                tag = ""
                kind = "text"
        except TokenizationException as e:
            raise FormattedException(
                f'''{e.message} at line {state.line} col {state.col}:"{e.line_content}"''',
            ) from e

        # Number of lines the token text occupies, ignoring outer newlines.
        line_span = len(s.strip("\n").split("\n"))

        token = Token(
            kind=kind,
            s=s,
            tag=tag.strip(),
            line=state.line,
            col=state.col,
            line_span=line_span,
        )
        tokens.append(token)
        advance(len(s))

    return tokens
# Tags that validate() requires to be written in self-closing form
# (and the only tags it allows to be self-closing) outside math/svg content.
HTML_VOID_TAGS = {
    "area", "base", "br", "col",
    "command", "embed", "hr", "img",
    "input", "keygen", "link", "meta",
    "param", "source", "track", "wbr",
}
# Inline HTML tags.  validate() only enforces matching start/end columns
# for these when the end tag begins its own row.
# The following excludes some obscure tags that are never used
# in Zulip code.
HTML_INLINE_TAGS = {
    "a", "b", "br", "button",
    "cite", "code", "em", "i",
    "img", "input", "kbd", "label",
    "object", "script", "select", "small",
    "span", "strong", "textarea",
}
def validate(fn: Optional[str] = None, text: Optional[str] = None) -> None:
    """Validate tag balance and whitespace conventions of one template.

    Args:
        fn: path of the template; only used for messages when text is given.
        text: template source; read from fn when None.

    Raises:
        TemplateParserException: tokenization failed, a whitespace or
            dangling-tag rule was violated, or tags are mismatched,
            mis-indented, wrongly self-closing, or left unclosed.
        AssertionError: neither fn nor text is truthy, or a token kind is
            not handled by the dispatch loop below.
    """
    assert fn or text

    if fn is None:
        fn = "<in memory file>"

    if text is None:
        with open(fn) as f:
            text = f.read()

    # Kept so the row holding an end tag can be inspected further down.
    lines = text.split("\n")

    try:
        tokens = tokenize(text)
    except FormattedException as e:
        raise TemplateParserException(
            f"""
fn: {fn}
{e}"""
        )

    prevent_whitespace_violations(fn, tokens)
    prevent_dangling_tags(fn, tokens)

    class State:
        # depth: number of currently open start tags.
        # foreign: True while inside a math or svg element.
        # matcher: callback invoked for the next end/else token.
        def __init__(self, func: Callable[[Token], None]) -> None:
            self.depth = 0
            self.foreign = False
            self.matcher = func

    def no_start_tag(token: Token) -> None:
        # Initial matcher: an end tag arrived while nothing was open.
        raise TemplateParserException(
            f"""
No start tag
fn: {fn}
end tag:
{token.tag}
line {token.line}, col {token.col}
"""
        )

    state = State(no_start_tag)

    def start_tag_matcher(start_token: Token) -> None:
        # Open a new nesting level.  The previous matcher and foreign flag
        # are captured by the closure below and restored when the matching
        # end tag is seen, so the chain of closures acts as a stack.
        state.depth += 1
        start_tag = start_token.tag.strip("~")
        start_line = start_token.line
        start_col = start_token.col

        old_matcher = state.matcher
        old_foreign = state.foreign

        if start_tag in ["math", "svg"]:
            state.foreign = True

        def f(end_token: Token) -> None:
            # Matcher for the end (or else) token paired with start_token.
            is_else_tag = end_token.tag == "else"

            end_tag = end_token.tag.strip("~")
            end_line = end_token.line
            end_col = end_token.col

            is_inline_tag = start_tag in HTML_INLINE_TAGS and start_token.kind == "html_start"

            def report_problem() -> Optional[str]:
                # Return a description of the first problem found, else None.
                if (start_tag == "code") and (end_line == start_line + 1):
                    return "Code tag is split across two lines."

                if is_else_tag:
                    # We are not completely rigorous about having a sensible
                    # order of if/elif/elif/else, but we catch obviously
                    # mismatching else tags.
                    if start_tag not in ("if", "else", "unless"):
                        return f"Unexpected else/elif tag encountered after {start_tag} tag."
                elif start_tag != end_tag:
                    return f"Mismatched tags: ({start_tag} != {end_tag})"

                # Columns are only compared when the end tag is more than
                # one line below the start tag.
                if end_line > start_line + 1:
                    if is_inline_tag:
                        # Inline tags are only checked when the end tag is
                        # the first thing on its row.
                        end_row_text = lines[end_line - 1]
                        if end_row_text.lstrip().startswith(end_token.s):
                            if end_col != start_col:
                                return "Indentation for start/end tags does not match."
                    else:
                        if end_col != start_col:
                            return "Indentation for start/end tags does not match."

                return None

            problem = report_problem()
            if problem:
                raise TemplateParserException(
                    f"""
fn: {fn}
{problem}
start:
{start_token.s}
line {start_line}, col {start_col}
end tag:
{end_tag}
line {end_line}, col {end_col}
"""
                )

            # An else tag keeps the block open; only a real end tag pops it.
            if not is_else_tag:
                state.matcher = old_matcher
                state.foreign = old_foreign
                state.depth -= 1

        state.matcher = f

    for token in tokens:
        kind = token.kind
        tag = token.tag

        # Token kinds that never affect nesting.
        if kind in (
            "django_comment",
            "handlebar_comment",
            "handlebars_singleton",
            "indent",
            "template_var",
            "html_comment",
            "html_doctype",
            "newline",
            "text",
            "whitespace",
        ):
            continue

        if kind == "html_start":
            if not state.foreign and tag in HTML_VOID_TAGS:
                raise TemplateParserException(
                    f"Tag must be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
                )
            start_tag_matcher(token)
        elif kind == "html_singleton":
            if not state.foreign and tag not in HTML_VOID_TAGS:
                raise TemplateParserException(
                    f"Tag must not be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
                )
        elif kind == "html_end":
            state.matcher(token)

        elif kind == "handlebars_start":
            start_tag_matcher(token)
        elif kind == "handlebars_else":
            state.matcher(token)
        elif kind == "handlebars_end":
            state.matcher(token)

        # NOTE(review): "django_else" appears in both sets below, so the
        # first branch always handles it and the second occurrence is
        # unreachable.  tokenize() gives those tokens the tag "else", which
        # is_django_block_tag() does not list, so they are skipped here.
        # Confirm this is intended.
        elif kind in {
            "django_start",
            "django_else",
            "jinja2_whitespace_stripped_start",
            "jinja2_whitespace_stripped_type2_start",
        }:
            # Only block tags open a level; other tags need no end tag.
            if is_django_block_tag(tag):
                start_tag_matcher(token)
        elif kind in {"django_else", "django_end", "jinja2_whitespace_stripped_end"}:
            state.matcher(token)
        else:
            raise AssertionError(f"tools programmer neglected to handle {kind} tokens")

    # Anything still open at end of input was never closed.
    if state.depth != 0:
        raise TemplateParserException("Missing end tag")
def prevent_whitespace_violations(fn: str, tokens: List[Token]) -> None:
    """Reject templates that break the whitespace conventions.

    Checks, in order: no whitespace at the very start of the file; no
    whitespace-only rows; indents that are a multiple of 4 spaces; no runs
    of more than one space between tokens; no trailing whitespace.

    Args:
        fn: template path, used only in error messages.
        tokens: output of tokenize(); may be empty.

    Raises:
        TemplateParserException: one of the conventions is violated.
        AssertionError: an indent token is followed by more whitespace,
            which the tokenizer should never produce.
    """
    # An empty template has nothing to check; indexing tokens[0] would
    # otherwise raise IndexError.
    if not tokens:
        return

    if tokens[0].kind in ("indent", "whitespace"):
        raise TemplateParserException(f" Please remove the whitespace at the beginning of {fn}.")

    # Walk (token, next_token) pairs.  The first token was handled above and
    # the last token has no successor, so neither is visited as `token`.
    for token, next_token in zip(tokens[1:], tokens[2:]):
        if token.kind == "indent":
            if next_token.kind in ("indent", "whitespace"):
                raise AssertionError("programming error parsing indents")

            if next_token.kind == "newline":
                raise TemplateParserException(
                    f"""Please just make row {token.line} in {fn} a truly blank line (no spaces)."""
                )

            if len(token.s) % 4 != 0:
                raise TemplateParserException(
                    f"""
Please use 4-space indents for template files. Most of our
codebase (including Python and JavaScript) uses 4-space indents,
so it's worth investing in configuring your editor to use
4-space indents for files like
{fn}
The line at row {token.line} is indented with {len(token.s)} spaces.
"""
                )

        if token.kind == "whitespace":
            if len(token.s) > 1:
                raise TemplateParserException(
                    f"""
We did not expect this much whitespace at row {token.line} column {token.col} in {fn}.
"""
                )
            if next_token.kind == "newline":
                raise TemplateParserException(
                    f"""
Unexpected trailing whitespace at row {token.line} column {token.col} in {fn}.
"""
                )
def prevent_dangling_tags(fn: str, tokens: List[Token]) -> None:
    """Reject a checked tag that sits on the last row of a multi-line tag.

    Prevent this kind of HTML:

        <div attr attr
            attr attr>Stuff</div>

    We prefer:

        <div attr attr
            attr attr>
            Stuff
        </div>

    We may eventually have the pretty_printer code do this
    automatically, but there are some complications with
    legacy code.

    Raises:
        TemplateParserException: a div/button/p token starts before the row
            that follows the previous multi-line div/button/p token.
    """
    # We only apply this validation for a couple tag types, because
    # our existing templates may have some funny edge cases.  We eventually
    # want to be more aggressive here.  We may need to be extra careful
    # with tags like <pre> that have whitespace sensitivities.
    checked_tags = ("div", "button", "p")

    # First row on which the next checked tag may appear, or None when the
    # previous checked tag fit on a single line.
    earliest_row: Optional[int] = None

    for tok in tokens:
        if tok.tag not in checked_tags:
            continue

        if earliest_row and tok.line < earliest_row:
            raise TemplateParserException(
                f"""
Please fix line {tok.line} at {fn} (col {tok.col})
by moving this tag so that it closes the block at the
same indentation level as its start tag:
{tok.s}
"""
            )

        # A multi-line tag pushes the earliest allowed row past its last row.
        earliest_row = tok.line + tok.line_span if tok.line_span > 1 else None
def is_django_block_tag(tag: str) -> bool:
    """Return True when tag is a Django/Jinja2 tag that validate() treats as opening a block."""
    block_tags = (
        "autoescape",
        "block",
        "blocktrans",
        "comment",
        "for",
        "if",
        "ifequal",
        "macro",
        "raw",
        "trans",
        "verbatim",
        "with",
    )
    return tag in block_tags
def get_handlebars_tag(text: str, i: int) -> str:
    """Return the Handlebars tag that starts at text[i], through its closing "}}".

    Args:
        text: full template source.
        i: index of the opening "{{".

    Raises:
        TokenizationException: the first "}" after the opener is not
            followed by a second "}", or the input ends first.
    """
    end = i + 2
    while end < len(text) - 1 and text[end] != "}":
        end += 1

    # Compare a slice instead of indexing text[end] and text[end + 1]:
    # slicing cannot raise, so a tag cut off by end of input is reported
    # as a missing "}}" rather than crashing with IndexError.
    if text[end : end + 2] != "}}":
        raise TokenizationException('Tag missing "}}"', text[i : end + 2])
    return text[i : end + 2]
def get_spaces(text: str, i: int) -> str:
    """Return the run of space characters that starts at text[i] (possibly empty)."""
    stop = i
    size = len(text)
    while stop < size and text[stop] == " ":
        stop += 1
    # Every character in the run is a space, so rebuild it from its length.
    return " " * (stop - i)
def get_text(text: str, i: int) -> str:
    """Return the plain text from text[i] up to the next "{" or "<".

    Scanning stops at end of input if neither delimiter is found.  The
    result has leading and trailing whitespace stripped, so it may be empty.
    """
    collected = []
    for pos in range(i, len(text)):
        ch = text[pos]
        if ch in "{<":
            break
        collected.append(ch)
    return "".join(collected).strip()
def get_django_tag(text: str, i: int, stripped: bool = False) -> str:
    """Return the Django/Jinja2 tag that starts at text[i], through its closing "%}".

    Args:
        text: full template source.
        i: index of the opening "{%".
        stripped: True when the opener is "{%-", so scanning starts one
            character later.

    Raises:
        TokenizationException: the first "%" after the opener is not
            followed by "}", or the input ends first.
    """
    end = i + 2
    if stripped:
        end += 1
    while end < len(text) - 1 and text[end] != "%":
        end += 1

    # Compare a slice instead of indexing text[end] and text[end + 1]:
    # slicing cannot raise, so a tag cut off by end of input is reported
    # as a missing "%}" rather than crashing with IndexError.
    if text[end : end + 2] != "%}":
        raise TokenizationException('Tag missing "%}"', text[i : end + 2])
    return text[i : end + 2]
def get_html_tag(text: str, i: int) -> str:
    """Return the HTML tag that starts at text[i], through its closing ">".

    A ">" inside a double-quoted attribute value does not end the tag.

    Raises:
        TokenizationException: the double quotes are unbalanced, or no
            closing ">" is found.
    """
    quotes_seen = 0
    first_lt = 0  # index of the first "<" met inside the tag, 0 if none
    end = i + 1

    while end < len(text):
        ch = text[end]
        # Only a ">" outside of quotes terminates the tag.
        if ch == ">" and quotes_seen % 2 == 0:
            break
        if ch == '"':
            quotes_seen += 1
        if not first_lt and ch == "<":
            first_lt = end
        end += 1

    if quotes_seen % 2 != 0:
        # Cut the snippet at the next "<" when one was seen, since that is
        # where the following tag begins.
        snippet = text[i:first_lt] if first_lt else text[i : end + 1]
        raise TokenizationException("Unbalanced quotes", snippet)

    if end == len(text) or text[end] != ">":
        raise TokenizationException('Tag missing ">"', text[i : end + 1])

    return text[i : end + 1]
def get_html_comment(text: str, i: int) -> str:
    """Return the HTML comment that starts at text[i], through its closing "-->".

    Args:
        text: full template source.
        i: index of the opening "<!--".

    Raises:
        TokenizationException: no "-->" follows.  The attached snippet runs
            up to the next "<", or to end of input when there is none.
    """
    # The terminator may start no earlier than just past "<!--" (4 chars),
    # which makes the shortest accepted comment "<!---->" (7 chars).
    close = text.find("-->", i + 4)
    if close != -1:
        return text[i : close + 3]

    # str.find never reads past the end of the text, so an unclosed comment
    # at end of input is reported properly instead of raising IndexError.
    next_tag = text.find("<", i + 7)
    snippet = text[i:next_tag] if next_tag != -1 else text[i:]
    raise TokenizationException("Unclosed comment", snippet)
def get_handlebar_comment(text: str, i: int) -> str:
    """Return the Handlebars comment that starts at text[i], through its closing "}}".

    Args:
        text: full template source.
        i: index of the opening "{{!".

    Raises:
        TokenizationException: no "}}" follows.  The attached snippet runs
            up to the next "<", or to end of input when there is none.
    """
    # The terminator may start no earlier than just past "{{!" (3 chars),
    # which makes the shortest accepted comment "{{!}}" (5 chars).
    close = text.find("}}", i + 3)
    if close != -1:
        return text[i : close + 2]

    # str.find never reads past the end of the text, so an unclosed comment
    # at end of input is reported properly instead of raising IndexError.
    next_tag = text.find("<", i + 5)
    snippet = text[i:next_tag] if next_tag != -1 else text[i:]
    raise TokenizationException("Unclosed comment", snippet)
def get_template_var(text: str, i: int) -> str:
    """Return the template variable that starts at text[i].

    The match ends at the first "}" found at or after text[i + 2]; a second
    "}" immediately following it is included as well.

    Args:
        text: full template source.
        i: index of the opening "{".

    Raises:
        TokenizationException: no closing "}" follows.  The attached
            snippet runs up to the next "<", or to end of input when there
            is none.
    """
    close = text.find("}", i + 2)
    if close != -1:
        end = close + 1
        if end < len(text) and text[end] == "}":
            end += 1
        return text[i:end]

    # str.find never reads past the end of the text, so an unclosed variable
    # at end of input is reported properly instead of raising IndexError.
    next_tag = text.find("<", i + 3)
    snippet = text[i:next_tag] if next_tag != -1 else text[i:]
    raise TokenizationException("Unclosed var", snippet)
def get_django_comment(text: str, i: int) -> str:
    """Return the Django comment that starts at text[i], through its closing "#}".

    Args:
        text: full template source.
        i: index of the opening "{#".

    Raises:
        TokenizationException: no "#}" follows.  The attached snippet runs
            up to the next "<", or to end of input when there is none.
    """
    # The terminator may start no earlier than just past "{#" (2 chars),
    # which makes the shortest accepted comment "{##}" (4 chars).
    close = text.find("#}", i + 2)
    if close != -1:
        return text[i : close + 2]

    # str.find never reads past the end of the text, so an unclosed comment
    # at end of input is reported properly instead of raising IndexError.
    next_tag = text.find("<", i + 4)
    snippet = text[i:next_tag] if next_tag != -1 else text[i:]
    raise TokenizationException("Unclosed comment", snippet)
def get_handlebar_partial(text: str, i: int) -> str:
    """Return the Handlebars partial that starts at text[i], through its closing "}}".

    Args:
        text: full template source.
        i: index of the opening "{{>".

    Raises:
        TokenizationException: no "}}" follows.  The attached snippet runs
            up to the next "<", or to end of input when there is none.
    """
    # Start looking right after the "{{>" opener (3 chars).  Requiring a
    # longer minimum would skip the terminator of a short partial such as
    # "{{> foo}}" and run on to a later, unrelated "}}".
    close = text.find("}}", i + 3)
    if close != -1:
        return text[i : close + 2]

    # str.find never reads past the end of the text, so an unclosed partial
    # at end of input is reported properly instead of raising IndexError.
    next_tag = text.find("<", i + 3)
    snippet = text[i:next_tag] if next_tag != -1 else text[i:]
    raise TokenizationException("Unclosed partial", snippet)