zulip/tools/convert-help-center-docs-to-mdx

#!/usr/bin/env python3

import os
import re
import shutil
import sys

import django
from django.template import engines
from django.template.backends.jinja2 import Jinja2
from pydantic.alias_generators import to_pascal

# Absolute path of the parent of the directory that contains this script.
# The input ("help") and output ("help-beta/...") folders are resolved
# relative to it in run().
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put that same parent directory first on sys.path so the `scripts`
# package imported just below can be found.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from scripts.lib.setup_path import setup_path

# NOTE(review): presumably this points the interpreter at the project's
# virtualenv; confirm against scripts/lib/setup_path.
setup_path()

# Configure Django at import time; convert_file_to_mdx() looks up the
# "Jinja2" template engine through django.template.engines.
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
django.setup()


def replace_emoticon_translation_table(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Swap the emoticon_translations placeholder for the
    <EmoticonTranslations> astro component.

    The placeholder is matched in its brace-escaped form (a backslash
    in front of each curly brace). When at least one placeholder is
    present, the import for the component is recorded in
    `import_statement_set`; otherwise the input is returned untouched.
    """
    placeholder = "\\{emoticon_translations\\}"
    if placeholder not in markdown_string:
        return markdown_string

    import_statement_set.add(
        "import EmoticonTranslations from '../../components/EmoticonTranslations.astro';"
    )
    # The component is surrounded by newlines so it ends up on its own line.
    return markdown_string.replace(placeholder, "\n<EmoticonTranslations />\n")


def replace_image_path(markdown_string: str) -> str:
    """
    Rewrite help-center image references so they point at the
    existing image folder via a relative path.

    We keep using the existing folder till the cutover. After that,
    the images will be copied to the src folder for help-beta in
    order to take advantage of Astro's image optimization.
    See https://chat.zulip.org/#narrow/stream/6-frontend/topic/Handling.20images.20in.20help.20center.20starlight.20migration.2E/near/1915130
    """
    old_root = "/static/images/help-beta"
    new_root = "../../../../static/images/help"
    # Only references that directly follow `(` (Markdown links/images)
    # or `="` (HTML attributes) are rewritten. A bare /static/images
    # replacement would also hit the few zulip.com links in the
    # documentation that contain that blurb as a part of the url.
    for opener in ("(", '="'):
        markdown_string = markdown_string.replace(opener + old_root, opener + new_root)
    return markdown_string


def fix_file_imports(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Turn `{!file!}` include lines into component tags.

    A line counts as an include when it consists solely of
    `{!name!}`, preceded by at most three spaces and optionally
    followed by spaces. Each such line becomes `<Component />`, where
    the component name is `name` passed through `to_pascal` with "-"
    and ".Md" removed, and a matching import from
    `./include/_<name>` is recorded in `import_statement_set`.
    """
    include_line = re.compile(r"^ {,3}\{!([^!]+)!\} *$", re.MULTILINE)

    def component_name(include_name: str) -> str:
        pascal = to_pascal(include_name)
        return pascal.replace("-", "").replace(".Md", "")

    def as_component_tag(found: re.Match[str]) -> str:
        return "<" + component_name(found.group(1)) + " />"

    # Collect the imports from the untouched input, then rewrite it.
    for include_name in include_line.findall(markdown_string):
        import_statement_set.add(
            f'import {component_name(include_name)} from "./include/_{include_name}"'
        )

    return include_line.sub(as_component_tag, markdown_string)


def escape_curly_braces(markdown_string: str) -> str:
    """
    Prefix every curly brace with a backslash.

    MDX treats curly braces as the start/end of a JS expression, so
    they have to be escaped wherever we want them to be rendered as
    literal text.
    """
    return "".join("\\" + char if char in "{}" else char for char in markdown_string)


def fix_relative_path(markdown_string: str) -> str:
    """
    Point `help/` links at `help-beta/`.

    The docs will live at the `help-beta/` url until the project is
    migrated completely. This is a plain substring replacement, so
    every occurrence of `help/` in the document is rewritten.
    """
    current_prefix, beta_prefix = "help/", "help-beta/"
    return beta_prefix.join(markdown_string.split(current_prefix))


def append_str_to_line(text: str, destination_str: str, n: int) -> str:
    """
    Return `destination_str` with `text` inserted as a new line
    directly after line `n` (1-indexed).

    If `n` does not refer to an existing line, nothing is inserted.
    Lines are re-joined with a plain newline, and a trailing newline
    on `destination_str` is kept.
    """
    lines = destination_str.splitlines()
    if 1 <= n <= len(lines):
        lines[n - 1] += "\n" + text
    result = "\n".join(lines)
    # str.splitlines() discards the final line break, so without this
    # the document would silently lose its trailing newline.
    if destination_str.endswith("\n"):
        result += "\n"
    return result


def replace_icons(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Replace Font Awesome `<i>` tags with icon components.

    A tag is rewritten when its double-quoted `class` attribute
    contains the class `fa` followed, after whitespace, by
    `fa-<name>` (lowercase letters, digits and hyphens), and the
    element body is either empty or starts with whitespace and holds
    no nested tag. For example `<i class="fa fa-<name>"></i>` becomes
    `<FaName />`-style markup, where the component name is "Fa" plus
    `<name>` passed through `to_pascal` with hyphens removed. The
    import of that component from `~icons/fa/<name>` is recorded in
    `import_statement_set`. Other classes and attributes on the tag
    are dropped.
    """
    icon_tag = re.compile(
        r'<i[^>]*class="(?:[^"]*\s)?fa(?:\s+fa-([a-z0-9\-]+))(?:\s[^"]*)?"[^>]*>(?:\s[^<]*)?</i>',
    )

    def as_icon_component(found: re.Match[str]) -> str:
        name = found.group(1)
        component = "Fa" + to_pascal(name).replace("-", "")
        import_statement_set.add(f'import {component} from "~icons/fa/{name}"')
        return f"<{component} />"

    return icon_tag.sub(as_icon_component, markdown_string)


def insert_imports(markdown_string: str, import_statement_set: set[str]) -> str:
    """
    Insert the collected import statements after the second line of
    the document, followed by one empty line.

    The statements are emitted in sorted order so that the generated
    file is identical from run to run; iterating the set directly
    would make the order depend on string hashing.

    Returns the input unchanged when there is nothing to import.
    """
    if not import_statement_set:
        return markdown_string

    # This function is called when the frontmatter has not yet been
    # inserted. First line of the file is always the heading/title of
    # the file. We rely on the heading being the first line later in
    # the conversion when inserting frontmatter. For this reason, we
    # add the imports to the second line.
    #
    # NOTE: append_str_to_line ignores line numbers that do not exist,
    # so a document with fewer than two lines gets no imports.
    import_block = "\n".join(sorted(import_statement_set))

    # The trailing newline adds the empty line at the end of the
    # import statement list.
    return append_str_to_line(import_block + "\n", markdown_string, 2)


def insert_frontmatter(markdown_string: str) -> str:
    """
    Replace the heading on the first line with a frontmatter block
    that carries it as the document title.

    We use the heading in the first line for the existing files to
    extract the document title: leading `#` characters and the
    surrounding whitespace are stripped. We are not adding a
    description to the frontmatter yet.

    NOTE(review): the title is written as a plain, unquoted value.
    """
    # partition() yields an empty body when the document is only the
    # heading line, so the heading is removed in that case as well.
    heading_line, _, body = markdown_string.partition("\n")
    heading = heading_line.lstrip("#").strip()
    # The first line is dropped since starlight will display the
    # `title` as `H1` anyways.
    return f"---\ntitle: {heading}\n---\n{body}"


def convert_string_to_mdx(markdown_string: str) -> str:
    """
    Run the full Markdown -> MDX conversion pipeline on a document
    and return the converted text.

    The order of the steps matters; see the comments below.
    """
    # All imports inserted during conversion should be tracked here.
    pending_imports: set[str] = set()

    # Include lines are matched in their raw `{!...!}` form, so they
    # have to be handled before the braces get escaped.
    mdx = fix_file_imports(markdown_string, pending_imports)
    mdx = escape_curly_braces(mdx)
    mdx = fix_relative_path(mdx)
    # Matches the placeholder in its escaped form, hence after
    # escape_curly_braces.
    mdx = replace_emoticon_translation_table(mdx, pending_imports)
    # Looks for the `help-beta` image prefix that fix_relative_path
    # has just produced.
    mdx = replace_image_path(mdx)
    mdx = replace_icons(mdx, pending_imports)
    # Imports are placed while the heading is still the first line;
    # the frontmatter then replaces that heading.
    mdx = insert_imports(mdx, pending_imports)
    return insert_frontmatter(mdx)


def convert_file_to_mdx(
    markdown_file_path: str,
) -> str:
    """
    Given a path to a Markdown file, return the equivalent MDX file.

    Paths starting with "/" are read straight from disk; any other
    path is resolved through the loader of the Jinja2 template engine.
    """
    jinja = engines["Jinja2"]
    assert isinstance(jinja, Jinja2)
    if markdown_file_path.startswith("/"):
        # Read as UTF-8 explicitly instead of relying on the locale's
        # preferred encoding, which differs between environments.
        # NOTE(review): assumes the help files are UTF-8 encoded.
        with open(markdown_file_path, encoding="utf-8") as fp:
            markdown_string = fp.read()
    else:
        # get_source() returns a tuple whose first item is the source text.
        markdown_string = jinja.env.loader.get_source(jinja.env, markdown_file_path)[0]

    return convert_string_to_mdx(markdown_string)


def run() -> None:
    """
    Convert every file directly inside the `help` folder to MDX and
    write the result to `help-beta/src/content/docs`, then copy the
    `help/include` folder next to the converted files with each
    include file prefixed by an underscore.

    The output folder is recreated from scratch on every run.
    """
    input_dir = os.path.join(BASE_DIR, "help")
    output_dir = os.path.join(BASE_DIR, "help-beta/src/content/docs")
    print("Starting the conversion from MD to MDX...")

    converted_count = 0

    # We delete the directory first to remove any stale files
    # that might have been deleted in the `help` folder but
    # their converted mdx files stay around. On a first run the
    # directory does not exist yet, so there is nothing to delete.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    for name in os.listdir(input_dir):
        source_path = os.path.join(input_dir, name)
        # Sub-folders (such as `include`) are handled separately below.
        if not os.path.isfile(source_path):
            continue
        converted_count += 1
        mdx = convert_file_to_mdx(source_path)
        # splitext() only strips the final extension, so a file name
        # that itself contains dots is not cut at its first dot.
        mdx_name = os.path.splitext(name)[0] + ".mdx"
        with open(os.path.join(output_dir, mdx_name), "w", encoding="utf-8") as mdx_file:
            mdx_file.write(mdx)
    print(f"Converted {converted_count} files. Conversion completed.")

    # All files in the `include` folder will only be imports and not
    # standalone files. Therefore we do not do any manipulation on
    # them or convert them to mdx.
    include_source_dir = os.path.join(input_dir, "include")
    include_destination_dir = os.path.join(output_dir, "include")
    shutil.copytree(include_source_dir, include_destination_dir)

    # We do not want Astro to render these include files as standalone
    # files, prefixing them with an underscore accomplishes that.
    # https://docs.astro.build/en/guides/routing/#excluding-pages
    for name in os.listdir(include_destination_dir):
        os.rename(
            os.path.join(include_destination_dir, name),
            os.path.join(include_destination_dir, "_" + name),
        )


# Perform the conversion as soon as this file is executed.
run()