From 033351609d4d674f78d2d2d65cf308c0aa516226 Mon Sep 17 00:00:00 2001
From: Sumanth V Rao <sumanthvrao@gmail.com>
Date: Sun, 6 Sep 2020 12:11:37 +0530
Subject: [PATCH] markdown: Add data-codehilite-language attr for fenced code.

When converting fenced code markdown, we add the language (if specified)
as a data-attribute by tweaking the generated HTML. Doing so allows the
frontend to use this attribute to display a view-in-playground option
for code blocks.

We use pygments to get the lexer subclass name and use that, instead of
the raw language string, in the data-attribute. Doing so helps us map
different language aliases (like `js` and `javascript`) to a common
value (like `JavaScript`), and saves the client from dealing with
multiple tags corresponding to the same language.

The HTML structure for a message like this:

``` js
..content..
```

would now be:

<div class="codehilite" data-codehilite-language="JavaScript">
    <pre>..content..</pre>
</div>

Tests and fixtures amended.
---
 zerver/lib/markdown/fenced_code.py            | 21 +++++++++++++++++++
 .../tests/fixtures/markdown_test_cases.json   |  6 +++---
 zerver/tests/test_markdown.py                 |  4 +++-
 3 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/zerver/lib/markdown/fenced_code.py b/zerver/lib/markdown/fenced_code.py
index 79edb00917..b1b1c76273 100644
--- a/zerver/lib/markdown/fenced_code.py
+++ b/zerver/lib/markdown/fenced_code.py
@@ -80,7 +80,10 @@ from typing import Any, Dict, Iterable, List, Mapping, MutableSequence, Optional
 
 import markdown
 from django.utils.html import escape
+from lxml import etree
 from markdown.extensions.codehilite import CodeHilite, CodeHiliteExtension
+from pygments.lexers import get_lexer_by_name
+from pygments.util import ClassNotFound
 
 from zerver.lib.exceptions import MarkdownRenderingException
 from zerver.lib.tex import render_tex
@@ -392,6 +395,24 @@ class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
         else:
             code = CODE_WRAP.format(langclass, self._escape(text))
 
+        # In order to display a "view-in-playground" option in the frontend,
+        # we need to know the language used in the codeblock. We tweak the HTML
+        # CodeHilite generates to add this language as a data-attribute.
+        if lang:
+            parsed_code = etree.HTML(code)
+            div_tag = parsed_code[0][0]
+            # We use the lexer subclass name instead of the raw lang, so that every alias of a
+            # language yields the same attribute value. Eg: `js` and `javascript` are both
+            # mapped to `JavaScript`. In case no lexer with that alias is found, we fall back
+            # to using the raw lang as the attribute value.
+            try:
+                lexer_subclass_name = get_lexer_by_name(lang).name
+            except ClassNotFound:
+                lexer_subclass_name = lang
+            div_tag.attrib['data-codehilite-language'] = lexer_subclass_name
+            # By default, lxml serializes empty tags like <span></span> as <span/>;
+            # specifying method="c14n" when converting to a string prevents that.
+            code = etree.tostring(div_tag, method="c14n").decode()
         return code
 
     def format_quote(self, text: str) -> str:
diff --git a/zerver/tests/fixtures/markdown_test_cases.json b/zerver/tests/fixtures/markdown_test_cases.json
index 5a66d952f2..8a8cb5c2e6 100644
--- a/zerver/tests/fixtures/markdown_test_cases.json
+++ b/zerver/tests/fixtures/markdown_test_cases.json
@@ -3,7 +3,7 @@
     {
       "name": "codeblock_hilite",
       "input": "Hamlet said:\n~~~~.python \ndef speak(self):\n    x = 1\n~~~~",
-      "expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\"><pre><span></span><code><span class=\"k\">def</span> <span class=\"nf\">speak</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">):</span>\n    <span class=\"n\">x</span> <span class=\"o\">=</span> <span class=\"mi\">1</span>\n</code></pre></div>",
+      "expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\" data-codehilite-language=\"Python\"><pre><span></span><code><span class=\"k\">def</span> <span class=\"nf\">speak</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">):</span>\n    <span class=\"n\">x</span> <span class=\"o\">=</span> <span class=\"mi\">1</span>\n</code></pre></div>",
       "marked_expected_output": "<p>Hamlet said:</p>\n<div class=\"codehilite\"><pre><span></span><code>def speak(self):\n    x = 1\n</code></pre></div>",
       "text_content": "Hamlet said:\ndef speak(self):\n    x = 1\n"
     },
@@ -786,13 +786,13 @@
     {
       "name": "tex_fenced_tex",
       "input": "```tex\n\n\\pi \\textbf{ is not } 3.14\n```",
-      "expected_output": "<div class=\"codehilite\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
+      "expected_output": "<div class=\"codehilite\" data-codehilite-language=\"TeX\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
       "marked_expected_output": "<div class=\"codehilite\"><pre><span></span><code>\\pi \\textbf{ is not } 3.14\n</code></pre></div>"
     },
     {
       "name": "tex_fenced_latex",
       "input": "```latex\n\n\\pi \\textbf{ is not } 3.14\n```",
-      "expected_output": "<div class=\"codehilite\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
+      "expected_output": "<div class=\"codehilite\" data-codehilite-language=\"TeX\"><pre><span></span><code><span class=\"k\">\\pi</span> <span class=\"k\">\\textbf</span><span class=\"nb\">{</span> is not <span class=\"nb\">}</span> 3.14\n</code></pre></div>",
       "marked_expected_output": "<div class=\"codehilite\"><pre><span></span><code>\\pi \\textbf{ is not } 3.14\n</code></pre></div>"
     },
     {
diff --git a/zerver/tests/test_markdown.py b/zerver/tests/test_markdown.py
index f0b0282d8a..13eadaa227 100644
--- a/zerver/tests/test_markdown.py
+++ b/zerver/tests/test_markdown.py
@@ -1376,6 +1376,7 @@ class MarkdownTest(ZulipTestCase):
         msg_without_language = markdown_convert_wrapper(text.format(''))
         msg_with_quote = markdown_convert_wrapper(text.format('quote'))
         msg_with_math = markdown_convert_wrapper(text.format('math'))
+        msg_with_none = markdown_convert_wrapper(text.format('none'))
 
         # Render with default=javascript
         do_set_realm_property(realm, 'default_code_block_language', 'javascript')
@@ -1403,7 +1404,8 @@ class MarkdownTest(ZulipTestCase):
         self.assertTrue(msg_with_python == msg_with_python_default_js == msg_without_language_default_py)
         self.assertTrue(msg_with_quote == msg_without_language_default_quote)
         self.assertTrue(msg_with_math == msg_without_language_default_math)
-        self.assertTrue(msg_without_language == msg_with_none_default_py == msg_without_language_final)
+        self.assertTrue(msg_without_language == msg_without_language_final)
+        self.assertTrue(msg_with_none == msg_with_none_default_py)
 
         # Test checking inside nested quotes
         nested_text = "````quote\n\n{}\n\n{}````".format(text.format('js'), text.format(''))