zulip/tools/convert-help-center-docs-to-mdx

#!/usr/bin/env python3

import os
import re
import shutil
import sys
from collections.abc import Callable
from textwrap import indent

import django
from django.template import engines
from django.template.backends.jinja2 import Jinja2
from pydantic.alias_generators import to_pascal

# Parent of the directory that contains this script; `run` resolves the
# `help` and `help-beta` directories relative to it.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the script's parent directory first on the import path so that the
# `scripts` and `zerver` packages imported below can be found.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from scripts.lib.setup_path import setup_path

# NOTE(review): presumably prepares the project's Python environment/paths;
# see scripts.lib.setup_path for the exact behavior.
setup_path()

# Django is configured before the `zerver` import below; presumably that
# module (or something it imports) requires configured settings.
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
django.setup()

from zerver.lib.markdown.tabbed_sections import generate_content_blocks, parse_tabs

# One level of indentation (four spaces) applied to content that is nested
# inside the generated MDX components.
INDENT_SPACES = "    "


def replace_emoticon_translation_table(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Swap the emoticon_translations placeholder for the
    <EmoticonTranslations> Astro component.

    The placeholder is searched for in its brace-escaped form, so this
    must run on text that has already been through
    `escape_curly_braces`. The component's import statement is added
    to `import_statement_set` only when a placeholder was found.
    """
    escaped_placeholder = "\\{emoticon_translations\\}"
    if escaped_placeholder not in markdown_string:
        return markdown_string

    import_statement_set.add(
        "import EmoticonTranslations from '../../components/EmoticonTranslations.astro';"
    )
    # The component is surrounded by newlines so it sits on its own line.
    return markdown_string.replace(escaped_placeholder, "\n<EmoticonTranslations />\n")


def replace_image_path(markdown_string: str, replacement_path: str) -> str:
    """
    Point image references at `replacement_path`.

    Until the cutover we keep using the existing image folder; after
    that the images will be copied into the help-beta src folder to
    take advantage of Astro's image optimization.
    See https://chat.zulip.org/#narrow/stream/6-frontend/topic/Handling.20images.20in.20help.20center.20starlight.20migration.2E/near/1915130

    Only references that start a Markdown link target (`(`) or an
    attribute value (`="`) are rewritten.
    """
    image_root = "/static/images/help-beta"
    # We do not replace the image root on its own since there are a
    # few instances in the documentation where zulip.com links are
    # referenced with that blurb as a part of the url. Requiring the
    # leading delimiter keeps those links untouched.
    for delimiter in ("(", '="'):
        markdown_string = markdown_string.replace(
            delimiter + image_root, delimiter + replacement_path
        )
    return markdown_string


def fix_file_imports(
    markdown_string: str, import_statement_set: set[str], import_relative_base_path: str
) -> str:
    """
    Replace `{!file.md!}` include macros with MDX component tags.

    A macro is only recognized when it is alone on its line and
    indented by at most three spaces. Each one becomes
    `<ComponentName />`, and an import of that component from
    `{import_relative_base_path}/_{file}x` is added to
    `import_statement_set`.
    """
    include_macro = re.compile(r"^ {,3}\{!([^!]+)!\} *$", re.MULTILINE)

    def component_name_for(include_file: str) -> str:
        # PascalCase the file name, then drop hyphens and the
        # title-cased ".Md" extension.
        return to_pascal(include_file).replace("-", "").replace(".Md", "")

    def swap_macro_for_component(found: re.Match[str]) -> str:
        include_file = found.group(1)
        component = component_name_for(include_file)
        # Appending "x" turns the ".md" include name into ".mdx".
        import_statement_set.add(
            f'import {component} from "{import_relative_base_path}/_{include_file}x"'
        )
        return "<" + component + " />"

    return include_macro.sub(swap_macro_for_component, markdown_string)


def escape_curly_braces(markdown_string: str) -> str:
    """
    Backslash-escape every `{` and `}`.

    MDX treats curly braces as the delimiters of a JS expression, so
    literal braces have to be escaped to be rendered as text.
    """
    brace_escapes = str.maketrans({"{": r"\{", "}": r"\}"})
    return markdown_string.translate(brace_escapes)


def fix_relative_path(markdown_string: str) -> str:
    """
    Rewrite `help/` to `help-beta/`.

    The docs live at the `help-beta/` url until the project is
    migrated completely. Note that every occurrence of the substring
    `help/` is rewritten, not only those inside links.
    """
    current_prefix = "help/"
    beta_prefix = "help-beta/"
    # Splitting on the old prefix and re-joining with the new one
    # substitutes every non-overlapping occurrence.
    return beta_prefix.join(markdown_string.split(current_prefix))


def insert_string_at_line(text: str, destination_str: str, n: int) -> str:
    """
    Insert `text` as a new line before line `n` (1-indexed) of
    `destination_str`.

    When `n` is not the number of an existing line, nothing is
    inserted. In either case the result is re-joined with "\\n", so a
    trailing newline in `destination_str` is not preserved.
    """
    rows = destination_str.splitlines()
    position = n - 1
    if 0 <= position < len(rows):
        rows[position:position] = [text]
    return "\n".join(rows)


def replace_icon_with_unplugin_component(
    match: re.Match[str],
    icon_package_name: str,
    icon_component_prefix: str,
    import_statement_set: set[str],
) -> str:
    """
    Build the component tag that replaces one matched icon element.

    The icon name is taken from the first capture group of `match`.
    The component is named `icon_component_prefix` followed by the
    PascalCased icon name with hyphens removed, and its import from
    `~icons/{icon_package_name}/{icon_name}` is added to
    `import_statement_set`.
    """
    icon_name = match.group(1)
    pascal_icon_name = to_pascal(icon_name).replace("-", "")
    component_name = f"{icon_component_prefix}{pascal_icon_name}"
    import_statement_set.add(
        f'import {component_name} from "~icons/{icon_package_name}/{icon_name}"'
    )
    return f"<{component_name} />"


def replace_icons(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Replace `<i>` icon elements with unplugin-icons components.

    Two icon families are handled, Font Awesome first and then Zulip
    icons. An element is recognized when:

    - its `class` attribute contains the family class immediately
      followed by the icon class, e.g. `class="fa fa-NAME"` or
      `class="zulip-icon zulip-icon-NAME"` (other classes may come
      before or after that pair);
    - NAME consists of lowercase letters, digits and hyphens;
    - the element body is empty, or starts with whitespace and
      contains no `<`.

    For example `<i class="fa fa-NAME"></i>` is handed to
    `replace_icon_with_unplugin_component` with package `fa` and
    prefix `Fa`, which also records the import that is needed.
    """
    # (pattern, icon package name, component name prefix)
    icon_families = (
        (
            re.compile(
                r'<i[^>]*class="(?:[^"]*\s)?fa(?:\s+fa-([a-z0-9\-]+))(?:\s[^"]*)?"[^>]*>(?:\s[^<]*)?</i>',
            ),
            "fa",
            "Fa",
        ),
        (
            re.compile(
                r'<i[^>]*class="(?:[^"]*\s)?zulip-icon(?:\s+zulip-icon-([a-z0-9\-]+))(?:\s[^"]*)?"[^>]*>(?:\s[^<]*)?</i>',
            ),
            "zulip-icon",
            "ZulipIcons",
        ),
    )

    converted = markdown_string
    for pattern, package_name, component_prefix in icon_families:

        def to_component(
            found: re.Match[str],
            package_name: str = package_name,
            component_prefix: str = component_prefix,
        ) -> str:
            # Defaults bind this iteration's values to the callback.
            return replace_icon_with_unplugin_component(
                found, package_name, component_prefix, import_statement_set
            )

        converted = pattern.sub(to_component, converted)

    return converted


def convert_tab_syntax(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Convert our custom tab syntax to the MDX `<Tabs>`/`<TabItem>`
    equivalent. Modelled on the run method of `tabbed_sections.py`.

    Sections are converted one at a time until `parse_tabs` no longer
    finds any. A section without tabs is unwrapped: its content is
    kept and only the delimiting lines are dropped. The text is
    processed as a list of lines and re-joined with "\\n", so a
    trailing newline in the input is not preserved.
    """
    tab_item_template = """<TabItem label="{tab_label}">
{content}
</TabItem>"""

    def indent_tab_item_bodies(rendered_blocks: str) -> str:
        """
        `generate_content_blocks` does no indenting, and we want the
        content between the TabItem tags indented. A bit of string
        manipulation here is preferable to duplicating that function
        just to indent. Every line except the opening and closing
        TabItem tags gets one level of indentation.
        """
        return "\n".join(
            row if row.startswith(("<TabItem", "</TabItem>")) else INDENT_SPACES + row
            for row in rendered_blocks.splitlines()
        )

    rows = markdown_string.splitlines()
    while section := parse_tabs(rows):
        if "tabs" in section:
            rendered = generate_content_blocks(section, rows, tab_item_template)
            # Indent the tab bodies first, then the TabItems
            # themselves so that they nest inside <Tabs>.
            rendered = indent(indent_tab_item_bodies(rendered), INDENT_SPACES)
            replacement = f"<Tabs>\n{rendered}\n</Tabs>"
            import_statement_set.add(
                "import { Tabs, TabItem } from '@astrojs/starlight/components'"
            )
        else:
            # No tabs here: `start_tabs` and `end_tabs` were only used
            # to draw a border around the instructions. We do not want
            # that border anymore, so the content is kept as is.
            replacement = "\n".join(
                rows[section["start_tabs_index"] + 1 : section["end_tabs_index"]]
            )

        first_row = section["start_tabs_index"]
        row_after_section = section["end_tabs_index"] + 1
        # The whole section collapses into a single (multi-line) entry.
        rows = rows[:first_row] + [replacement] + rows[row_after_section:]

    return "\n".join(rows)


def detab(text: str) -> tuple[str, str]:
    """
    Split `text` into its leading indented block and the remainder.

    Lines are consumed from the start for as long as they are either
    indented by at least four spaces or blank. Consumed lines have one
    level of indentation (four spaces) removed; blank ones become
    empty strings. Returns `(detabbed_block, remaining_text)`.

    Adapted from
    https://github.com/Python-Markdown/markdown/blob/64a3c0fbc00327fbfee1fd6b44da0e5453287fe4/markdown/blockprocessors.py#L85
    We need this for converting admonitions to asides; duplicating it
    in this script is fine.
    """
    tab = " " * 4
    rows = text.split("\n")

    # Count how many leading rows belong to the indented block.
    consumed = 0
    while consumed < len(rows):
        row = rows[consumed]
        if not row.startswith(tab) and row.strip():
            break
        consumed += 1

    block = [row[len(tab) :] if row.startswith(tab) else "" for row in rows[:consumed]]
    return "\n".join(block), "\n".join(rows[consumed:])


def convert_admonitions_to_asides(
    markdown_string: str, import_statement_set: set[str], components_dir_path: str
) -> str:
    """
    Convert `!!! <class>` admonitions into MDX components.

    An admonition is a `!!! class` declaration line followed by its
    content: the run of lines that are indented by four spaces or
    blank. The content is re-indented and wrapped as follows:

    - `warn` becomes `<Aside type="note" title="">`
    - `tip` becomes `<Aside type="tip" title="Tip">`
    - `keyboard_tip` becomes `<KeyboardTip>`, imported from
      `components_dir_path`

    The matching import statement is added to `import_statement_set`.
    Any title given in the declaration is ignored: our help center
    files never specify one, and this script only handles the cases
    that exist in those files.

    Raises:
        Exception: if an admonition uses any other class.

    Much of this is adapted from
    https://github.com/Python-Markdown/markdown/blob/64a3c0fbc00327fbfee1fd6b44da0e5453287fe4/markdown/extensions/admonition.py
    """
    declaration_re = re.compile(
        r'(?:^|\n)!!! ?([\w\-]+(?: +[\w\-]+)*)(?: +"(.*?)")? *(?:\n|$)'
    )
    repeated_spaces_re = re.compile("  +")

    def render_aside(aside_type: str, aside_title: str, content: str) -> str:
        import_statement_set.add("import { Aside } from '@astrojs/starlight/components'")
        return f'\n<Aside type="{aside_type}" title="{aside_title}">\n{content}\n</Aside>\n'

    text = markdown_string
    # Convert one admonition per iteration. Each replacement removes
    # the declaration it matched, so the search eventually comes up
    # empty.
    while (declaration := declaration_re.search(text)) is not None:
        before_declaration = text[: declaration.start()]
        # Everything after the declaration line: the indented content
        # followed by the rest of the document.
        content, after_content = detab(text[declaration.end() :])
        content = indent(content, INDENT_SPACES)

        # Lower-case the class and collapse runs of spaces in it.
        klass = repeated_spaces_re.sub(" ", declaration.group(1).lower())
        if klass == "warn":
            # `warn` is rendered as a `note` since that is the
            # translation that remains most faithful to how we display
            # `warn` admonitions in our current help center
            # implementation.
            # See https://chat.zulip.org/#narrow/channel/19-documentation/topic/Stage.202.3A.20New.20syntax.20for.20!!!tip.20in.20help-beta/near/2174415
            # for more details.
            replacement = render_aside("note", "", content)
        elif klass == "tip":
            replacement = render_aside("tip", "Tip", content)
        elif klass == "keyboard_tip":
            import_statement_set.add(
                f"import KeyboardTip from '{components_dir_path}/KeyboardTip.astro';"
            )
            replacement = f"\n<KeyboardTip>\n{content}\n</KeyboardTip>\n"
        else:
            raise Exception(f"Unexpected admonition class during conversion: {klass}")

        text = before_declaration + replacement + after_content

    return text


def insert_imports(markdown_string: str, import_statement_set: set[str], line_number: int) -> str:
    """
    Insert the collected import statements, followed by an empty line,
    starting at line `line_number` (1-indexed) of `markdown_string`.

    The statements are written in sorted order so that converting the
    same file always produces the same output; the iteration order of
    a set of strings can change from one run to the next.

    If there are no imports the text is returned untouched. If
    `line_number` is not the number of an existing line, no imports
    are inserted (see `insert_string_at_line`).
    """
    if not import_statement_set:
        return markdown_string

    # This function is called when the frontmatter has not yet been
    # inserted. First line of the file is always the heading/title of
    # the file. We rely on the heading being the first line later in
    # the conversion when inserting frontmatter. For this reason, the
    # help center files get their imports on the second line.
    #
    # The trailing newline leaves an empty line after the last import.
    import_block = "\n".join(sorted(import_statement_set)) + "\n"
    return insert_string_at_line(import_block, markdown_string, line_number)


def insert_frontmatter(markdown_string: str) -> str:
    """
    Replace the first line (the heading) with YAML frontmatter.

    The document title is the first line with its leading `#`
    characters and surrounding whitespace removed. We are not adding a
    description to the frontmatter yet.

    The heading line itself is dropped since starlight will display
    the `title` as `H1` anyways. This also holds for a document that
    consists of nothing but the heading.

    NOTE(review): the title is written as a plain YAML scalar; a
    heading containing e.g. ": " would presumably need quoting.
    """
    heading_line, _, remainder = markdown_string.partition("\n")
    heading = heading_line.lstrip("#").strip()
    return f"---\ntitle: {heading}\n---\n{remainder}"


def get_markdown_string_from_file(markdown_file_path: str) -> str:
    """
    Return the raw Markdown source of `markdown_file_path`.

    A path starting with "/" is read straight from disk. Any other
    path is resolved through the loader of the Jinja2 template engine.
    """
    if markdown_file_path.startswith("/"):
        # Decode explicitly as UTF-8 rather than with the platform's
        # locale encoding, so that the result does not depend on the
        # machine running the script.
        with open(markdown_file_path, encoding="utf-8") as fp:
            return fp.read()

    # The template engine is only needed for non-absolute paths.
    jinja = engines["Jinja2"]
    assert isinstance(jinja, Jinja2)
    # `get_source` returns a tuple whose first item is the source text.
    return jinja.env.loader.get_source(jinja.env, markdown_file_path)[0]


def convert_help_center_file_to_mdx(
    markdown_file_path: str,
) -> str:
    """
    Given a path to a help center Markdown file, return the equivalent
    MDX document, including its imports and frontmatter.
    """
    # All imports inserted during conversion should be tracked here.
    imports: set[str] = set()

    source = get_markdown_string_from_file(markdown_file_path)

    # These steps look for our custom syntax in its raw form, so they
    # run before any curly braces are escaped.
    with_includes = fix_file_imports(source, imports, "./include")
    with_asides = convert_admonitions_to_asides(with_includes, imports, "../../components")
    with_tabs = convert_tab_syntax(with_asides, imports)

    escaped = escape_curly_braces(with_tabs)
    relinked = fix_relative_path(escaped)

    # The emoticon placeholder is matched in its escaped form, and the
    # image paths in their `help-beta` form, hence the order.
    with_emoticons = replace_emoticon_translation_table(relinked, imports)
    with_images = replace_image_path(with_emoticons, "../../../../static/images/help")
    with_icons = replace_icons(with_images, imports)

    # Imports go on the second line, right after the heading, which
    # the frontmatter step then replaces.
    with_imports = insert_imports(with_icons, imports, 2)
    return insert_frontmatter(with_imports)


def convert_include_file_to_mdx(
    markdown_file_path: str,
) -> str:
    """
    Given a path to an include Markdown file, return the equivalent
    MDX document.

    Include files are not served standalone; they are used as macros
    in other files. Compared to a normal help center file, two steps
    are therefore left out:
    - replace_emoticon_translation_table, since that function only
    applies to one file, and that file is not an include file.
    - insert_frontmatter, since frontmatter is not needed in files
    that are not served standalone.
    """
    # All imports inserted during conversion should be tracked here.
    imports: set[str] = set()

    source = get_markdown_string_from_file(markdown_file_path)

    # These steps look for our custom syntax in its raw form, so they
    # run before any curly braces are escaped.
    with_includes = fix_file_imports(source, imports, ".")
    with_asides = convert_admonitions_to_asides(with_includes, imports, "../../../components")
    with_tabs = convert_tab_syntax(with_asides, imports)

    escaped = escape_curly_braces(with_tabs)
    relinked = fix_relative_path(escaped)

    # Image paths are matched in their `help-beta` form, hence the
    # order.
    with_images = replace_image_path(relinked, "../../../../../static/images/help")
    with_icons = replace_icons(with_images, imports)

    # There is no frontmatter step here, so the imports go on the
    # very first line.
    return insert_imports(with_icons, imports, 1)


def _convert_directory(
    input_dir: str,
    output_dir: str,
    convert: Callable[[str], str],
    output_prefix: str = "",
) -> int:
    """
    Convert every regular file directly inside `input_dir` with
    `convert` and write the result to
    `{output_dir}/{output_prefix}{stem}.mdx`, where `stem` is the
    source file name without its extension. Subdirectories are
    skipped. Returns the number of files converted.
    """
    os.makedirs(output_dir, exist_ok=True)
    converted_count = 0
    for name in os.listdir(input_dir):
        source_path = os.path.join(input_dir, name)
        if not os.path.isfile(source_path):
            continue

        mdx = convert(source_path)
        # Only the final extension is stripped, which matches how
        # `fix_file_imports` derives the paths it imports from.
        stem = os.path.splitext(name)[0]
        output_path = os.path.join(output_dir, f"{output_prefix}{stem}.mdx")
        with open(output_path, "w", encoding="utf-8") as mdx_file:
            mdx_file.write(mdx)
        converted_count += 1
    return converted_count


def run() -> None:
    """
    Convert the help center files in `help/`, and the include files in
    `help/include/`, to MDX files under `help-beta/src/content/docs`.
    """
    input_dir = os.path.join(BASE_DIR, "help")
    output_dir = os.path.join(BASE_DIR, "help-beta/src/content/docs")
    print("Starting the conversion from MD to MDX...")

    # We delete the directory first to remove any stale files
    # that might have been deleted in the `help` folder but
    # their converted mdx files stay around
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        # Nothing to clean up, e.g. on the very first run.
        pass

    converted_count = _convert_directory(input_dir, output_dir, convert_help_center_file_to_mdx)
    print(f"Converted {converted_count} files. Proceeding to the conversion of include files ...")

    # Converted include files are prefixed with an underscore.
    include_converted_count = _convert_directory(
        os.path.join(input_dir, "include"),
        os.path.join(output_dir, "include"),
        convert_include_file_to_mdx,
        output_prefix="_",
    )
    print(f"Converted {include_converted_count} include files. Conversion completed.")


if __name__ == "__main__":
    run()