template_parser: Check for invalid combinations of HTML elements.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg
2025-09-05 16:43:57 -07:00
committed by Tim Abbott
parent 9452e2cbb2
commit 8691c11933
2 changed files with 232 additions and 35 deletions

191
tools/lib/html_elements.py Normal file
View File

@@ -0,0 +1,191 @@
from collections.abc import Iterator
# Table of the HTML content model, used to check that every element appears
# somewhere it is allowed to.
#
# Each key is an (element name, context) pair naming a context in which the
# element may appear; the value is the context that applies to the element's
# own children.  A context is either a broad content category ("flow",
# "phrasing", ...) or the children of one specific element ("<table>",
# "<select> content", ...).  A context also admits whatever is valid in the
# contexts it falls back to (see HTML_CONTEXT_FALLBACKS).
#
# Some child contexts get special treatment from the template parser:
#   "void"          the element must be written as a self-closing tag
#   "transparent"   children are checked against the context the element
#                   itself appeared in
#   "MathML"/"SVG"  foreign content (see FOREIGN_CONTEXTS)
VALID_HTML_CONTEXTS: dict[tuple[str, str], str] = {
    # https://html.spec.whatwg.org/multipage/indices.html#elements-3
    ("a", "phrasing"): "transparent",
    ("abbr", "phrasing"): "phrasing",
    ("address", "flow"): "flow",
    ("area", "phrasing"): "void",
    ("article", "flow"): "flow",
    ("aside", "flow"): "flow",
    ("audio", "phrasing"): "<audio>",
    ("b", "phrasing"): "phrasing",
    ("base", "<head>"): "void",
    ("bdi", "phrasing"): "phrasing",
    ("bdo", "phrasing"): "phrasing",
    ("blockquote", "flow"): "flow",
    ("body", "<html>"): "flow",
    ("br", "phrasing"): "void",
    ("button", "phrasing"): "phrasing",
    ("button", "<select>"): "phrasing",
    ("canvas", "phrasing"): "transparent",
    ("caption", "<table>"): "flow",
    ("center", "flow"): "flow",  # FIXME: obsolete, remove this
    ("cite", "phrasing"): "phrasing",
    ("code", "phrasing"): "phrasing",
    ("col", "<colgroup>"): "void",
    ("colgroup", "<table>"): "<colgroup>",
    ("data", "phrasing"): "phrasing",
    ("datalist", "phrasing"): "<datalist>",
    ("dd", "<dl>"): "flow",
    ("del", "phrasing"): "transparent",
    ("details", "flow"): "<details>",
    ("dfn", "phrasing"): "phrasing",
    ("dialog", "flow"): "flow",
    ("div", "flow"): "flow",
    ("div", "<dl>"): "<dl>",
    ("div", "<select> content"): "<select> content",
    ("div", "<optgroup> content"): "<optgroup> content",
    ("div", "<option> content"): "<option> content",
    ("dl", "flow"): "<dl>",
    ("dt", "<dl>"): "phrasing",
    ("em", "phrasing"): "phrasing",
    ("embed", "phrasing"): "void",
    ("fieldset", "flow"): "<fieldset>",
    ("figcaption", "<figure>"): "flow",
    ("figure", "flow"): "<figure>",
    ("footer", "flow"): "flow",
    ("form", "flow"): "flow",
    ("h1", "plain heading"): "phrasing",
    ("h2", "plain heading"): "phrasing",
    ("h3", "plain heading"): "phrasing",
    ("h4", "plain heading"): "phrasing",
    ("h5", "plain heading"): "phrasing",
    ("h6", "plain heading"): "phrasing",
    ("head", "<html>"): "<head>",
    ("header", "flow"): "flow",
    ("hgroup", "heading"): "<hgroup>",
    ("hr", "flow"): "void",
    ("hr", "<select> content"): "void",
    ("html", "document"): "<html>",
    ("i", "phrasing"): "phrasing",
    ("iframe", "phrasing"): "empty",
    ("img", "phrasing"): "void",
    ("img", "<picture>"): "void",
    ("input", "phrasing"): "void",
    ("ins", "phrasing"): "transparent",
    ("kbd", "phrasing"): "phrasing",
    ("label", "phrasing"): "phrasing",
    ("legend", "<fieldset>"): "phrasing/heading",
    ("li", "list"): "flow",
    ("link", "<head>"): "void",
    ("link", "phrasing"): "void",
    ("main", "flow"): "flow",
    ("map", "phrasing"): "<map>",
    ("mark", "phrasing"): "phrasing",
    ("math", "phrasing"): "MathML",
    ("menu", "flow"): "list",
    ("meta", "<head>"): "void",
    ("meta", "phrasing"): "void",
    ("meter", "phrasing"): "phrasing",
    ("nav", "flow"): "flow",
    ("noscript", "<head>"): "transparent",
    ("noscript", "phrasing"): "transparent",
    ("noscript", "<select> content"): "transparent",
    ("noscript", "<optgroup> content"): "transparent",
    ("object", "phrasing"): "transparent",
    ("ol", "flow"): "list",
    ("optgroup", "<select>"): "<optgroup> content",
    ("option", "<select>"): "<option> content",
    ("option", "<datalist>"): "<option> content",
    ("option", "<optgroup>"): "<option> content",
    # <optgroup> hands its children the "<optgroup> content" context (see the
    # "optgroup" rule above), so <option> has to be accepted in that context
    # to be usable inside an <optgroup> at all.
    ("option", "<optgroup> content"): "<option> content",
    ("output", "phrasing"): "phrasing",
    ("p", "flow"): "phrasing",
    ("p", "<hgroup>"): "phrasing",
    ("picture", "phrasing"): "<picture>",
    ("pre", "flow"): "phrasing",
    ("progress", "phrasing"): "phrasing",
    ("q", "phrasing"): "phrasing",
    ("rp", "<ruby>"): "phrasing",
    ("rt", "<ruby>"): "phrasing",
    ("ruby", "phrasing"): "<ruby>",
    ("s", "phrasing"): "phrasing",
    ("samp", "phrasing"): "phrasing",
    ("script", "<head>"): "<script>",
    ("script", "phrasing"): "<script>",
    ("script", "script-supporting"): "<script>",
    ("search", "flow"): "flow",
    ("section", "flow"): "flow",
    ("select", "phrasing"): "<select>",
    ("selectedcontent", "<button>"): "empty",
    ("slot", "phrasing"): "transparent",
    ("small", "phrasing"): "phrasing",
    ("source", "<picture>"): "void",
    ("source", "<video>"): "void",
    ("source", "<audio>"): "void",
    ("span", "phrasing"): "phrasing",
    ("strong", "phrasing"): "phrasing",
    ("style", "<head>"): "<style>",
    ("sub", "phrasing"): "phrasing",
    ("summary", "<details>"): "phrasing/heading",
    ("sup", "phrasing"): "phrasing",
    ("svg", "phrasing"): "SVG",
    ("table", "flow"): "<table>",
    ("tbody", "<table>"): "<tbody>",
    ("td", "<tr>"): "flow",
    ("template", "<head>"): "unknown",
    ("template", "phrasing"): "unknown",
    ("template", "script-supporting"): "unknown",
    ("template", "<colgroup>"): "unknown",
    ("textarea", "phrasing"): "text",
    ("tfoot", "<table>"): "<tfoot>",
    ("th", "<tr>"): "flow",
    ("thead", "<table>"): "<thead>",
    ("time", "phrasing"): "phrasing",
    ("title", "<head>"): "text",
    ("tr", "<table>"): "<tr>",
    ("tr", "<thead>"): "<tr>",
    ("tr", "<tbody>"): "<tr>",
    ("tr", "<tfoot>"): "<tr>",
    ("track", "<audio>"): "void",
    ("track", "<video>"): "void",
    ("u", "phrasing"): "phrasing",
    ("ul", "flow"): "list",
    ("var", "phrasing"): "phrasing",
    ("video", "phrasing"): "<video>",
    ("wbr", "phrasing"): "void",
    # https://html.spec.whatwg.org/multipage/embedded-content-other.html#mathml
    ("annotation-xml", "MathML"): "flow",
    ("mi", "MathML"): "phrasing",
    ("mo", "MathML"): "phrasing",
    ("mn", "MathML"): "phrasing",
    ("ms", "MathML"): "phrasing",
    ("mtext", "MathML"): "phrasing",
    # https://html.spec.whatwg.org/multipage/embedded-content-other.html#svg-0
    ("foreignObject", "SVG"): "flow",
    ("title", "SVG"): "phrasing",
}
# For each context, the further contexts whose elements are also allowed
# there.  html_context_fallbacks() follows these links transitively; a context
# with no entry here has no fallbacks.
HTML_CONTEXT_FALLBACKS: dict[str, list[str]] = {
    # "unknown" is the context the template parser starts in, and the context
    # given to the children of <template>.
    "unknown": ["document", "flow", "list", "<head>", "<select>", "<table>", "<tr>"],
    # Broad content categories.
    "flow": ["phrasing", "heading"],
    "heading": ["plain heading"],
    "phrasing/heading": ["phrasing", "heading"],
    "list": ["script-supporting"],
    # Children of grouping elements.
    "<details>": ["flow"],
    "<dl>": ["script-supporting"],
    "<fieldset>": ["flow"],
    "<figure>": ["flow"],
    "<hgroup>": ["plain heading", "script-supporting"],
    # Children of form controls.
    "<datalist>": ["phrasing", "script-supporting"],
    "<select>": ["<select> content"],
    "<select> content": ["script-supporting"],
    "<optgroup> content": ["script-supporting"],
    "<option>": ["<option> content"],
    "<option> content": ["phrasing"],
    # Children of embedded-content and annotation elements.
    "<picture>": ["script-supporting"],
    "<ruby>": ["phrasing"],
    # Children of table elements.
    "<table>": ["script-supporting"],
    "<thead>": ["script-supporting"],
    "<tbody>": ["script-supporting"],
    "<tfoot>": ["script-supporting"],
    "<tr>": ["script-supporting"],
}

# Contexts that hold foreign (non-HTML) content.
FOREIGN_CONTEXTS = ["MathML", "SVG"]
def html_context_fallbacks(context: str) -> Iterator[str]:
    """Yield ``context`` followed by every context it falls back to.

    Contexts are produced in depth-first pre-order: a context comes out before
    the fallbacks listed for it in ``HTML_CONTEXT_FALLBACKS``, and each
    fallback's own fallbacks are exhausted before its next sibling is
    visited.  A context reachable by more than one route is yielded once per
    route.
    """
    pending = [context]
    while pending:
        current = pending.pop()
        yield current
        # Push in reverse so that the first-listed fallback is visited next.
        pending.extend(reversed(HTML_CONTEXT_FALLBACKS.get(current, [])))

View File

@@ -2,6 +2,8 @@ from collections.abc import Callable
from typing_extensions import override from typing_extensions import override
from .html_elements import FOREIGN_CONTEXTS, VALID_HTML_CONTEXTS, html_context_fallbacks
class FormattedError(Exception): class FormattedError(Exception):
pass pass
@@ -277,25 +279,6 @@ def tokenize(text: str, template_format: str | None = None) -> list[Token]:
return tokens return tokens
HTML_VOID_TAGS = {
"area",
"base",
"br",
"col",
"command",
"embed",
"hr",
"img",
"input",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr",
}
# The following excludes some obscure tags that are never used # The following excludes some obscure tags that are never used
# in Zulip code. # in Zulip code.
HTML_INLINE_TAGS = { HTML_INLINE_TAGS = {
@@ -396,8 +379,8 @@ def validate(
class State: class State:
def __init__(self, func: Callable[[Token | None], None]) -> None: def __init__(self, func: Callable[[Token | None], None]) -> None:
self.depth = 0 self.depth = 0
self.foreign = False
self.matcher = func self.matcher = func
self.html_context = "unknown"
def no_start_tag(token: Token | None) -> None: def no_start_tag(token: Token | None) -> None:
assert token assert token
@@ -420,10 +403,7 @@ def validate(
start_col = start_token.col start_col = start_token.col
old_matcher = state.matcher old_matcher = state.matcher
old_foreign = state.foreign old_html_context = state.html_context
if start_tag in ["math", "svg"]:
state.foreign = True
def f(end_token: Token | None) -> None: def f(end_token: Token | None) -> None:
if end_token is None: if end_token is None:
@@ -478,7 +458,7 @@ def validate(
if not is_else_tag: if not is_else_tag:
state.matcher = old_matcher state.matcher = old_matcher
state.foreign = old_foreign state.html_context = old_html_context
state.depth -= 1 state.depth -= 1
# TODO: refine this for the else/elif use cases # TODO: refine this for the else/elif use cases
@@ -491,22 +471,48 @@ def validate(
kind = token.kind kind = token.kind
tag = token.tag tag = token.tag
if not state.foreign:
if kind == "html_start" and tag in HTML_VOID_TAGS:
raise TemplateParserError(
f"Tag must be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
)
elif kind == "html_singleton" and tag not in HTML_VOID_TAGS:
raise TemplateParserError(
f"Tag must not be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
)
flavor = tag_flavor(token) flavor = tag_flavor(token)
if flavor == "start": if flavor == "start":
start_tag_matcher(token) start_tag_matcher(token)
elif flavor == "end": elif flavor == "end":
state.matcher(token) state.matcher(token)
# Check the element against the HTML content model of its parent, and work
# out the context that its own children will be checked against.
if kind in ("html_start", "html_singleton"):
    # Try the current context first, then each context it falls back to; the
    # first matching rule decides the context for the element's children.
    for context in html_context_fallbacks(state.html_context):
        if (tag, context) in VALID_HTML_CONTEXTS:
            new_context = VALID_HTML_CONTEXTS[tag, context]
            if new_context == "transparent":
                # Transparent elements pass their parent's context through.
                new_context = state.html_context
            break
    else:
        if "-" in tag and "phrasing" in html_context_fallbacks(state.html_context):
            new_context = state.html_context  # custom elements
        elif state.html_context in FOREIGN_CONTEXTS:
            new_context = state.html_context  # unchecked foreign elements
        else:
            raise TemplateParserError(
                f"<{tag}> is not valid in {state.html_context} context"
                + (
                    ' (consider growing HTML_CONTEXT_FALLBACKS["unknown"]?)'
                    if state.html_context == "unknown"
                    else ""
                )
                # This piece needs the f prefix too; without it the
                # placeholders are printed literally instead of the location.
                + f" at {fn} line {token.line}, col {token.col}"
            )

    # The self-closing rules are not enforced when the resulting context is a
    # foreign one.
    if new_context not in FOREIGN_CONTEXTS:
        if kind == "html_start" and new_context == "void":
            raise TemplateParserError(
                f"Tag must be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
            )
        elif kind == "html_singleton" and new_context != "void":
            raise TemplateParserError(
                f"Tag must not be self-closing: {tag} at {fn} line {token.line}, col {token.col}"
            )

    # Only a start tag opens a scope for children; a self-closing tag leaves
    # the current context untouched.
    if kind == "html_start":
        state.html_context = new_context
if state.depth != 0: if state.depth != 0:
state.matcher(None) state.matcher(None)