html_diff: Migrate to use lxml.

We were using Google's diff-match-patch library to diff HTML. The problem with that approach is that it is a text differ, not an HTML differ and so it ends up messing up the HTML tags. `lxml` is a safer option. Fixes: #7219.
2025-11-04 14:03:30 +00:00 · 2017-10-30 20:11:34 +00:00
parent c160c06f9c
commit c863bb83a0
2 changed files with 52 additions and 133 deletions
--- a/zerver/lib/html_diff.py
+++ b/zerver/lib/html_diff.py
@@ -1,138 +1,25 @@
 import lxml
-from typing import Callable, List, Optional, Tuple, Text
+from lxml.html.diff import htmldiff
 from typing import Optional, Text
-from django.conf import settings
+def highlight_with_class(text, klass):
 from diff_match_patch import diff_match_patch
 import platform
 import logging
 # TODO: handle changes in link hrefs
 def highlight_with_class(klass, text):
    # type: (Text, Text) -> Text
    return '<span class="%s">%s</span>' % (klass, text)
 def highlight_inserted(text):
    # type: (Text) -> Text
    return highlight_with_class('highlight_text_inserted', text)
 def highlight_deleted(text):
    # type: (Text) -> Text
    return highlight_with_class('highlight_text_deleted', text)
 def chunkize(text, in_tag):
    # type: (Text, bool) -> Tuple[List[Tuple[Text, Text]], bool]
    start = 0
    idx = 0
    chunks = []  # type: List[Tuple[Text, Text]]
    for c in text:
        if c == '<':
            in_tag = True
            if start != idx:
                chunks.append(('text', text[start:idx]))
            start = idx
        elif c == '>':
            in_tag = False
            if start != idx + 1:
                chunks.append(('tag', text[start:idx + 1]))
            start = idx + 1
        idx += 1
    if start != idx:
        chunks.append(('tag' if in_tag else 'text', text[start:idx]))
    return chunks, in_tag
 def highlight_chunks(chunks, highlight_func):
    # type: (List[Tuple[Text, Text]], Callable[[Text], Text]) -> Text
    retval = u''
    for type, text in chunks:
        if type == 'text':
            retval += highlight_func(text)
        else:
            retval += text
    return retval
 def verify_html(html):
    # type: (Text) -> bool
    # TODO: Actually parse the resulting HTML to ensure we don't
    # create mal-formed markup.  This is unfortunately hard because
    # we both want pretty strict parsing and we want to parse html5
    # fragments.  For now, we do a basic sanity check.
    in_tag = False
    for c in html:
        if c == '<':
            if in_tag:
                return False
            in_tag = True
        elif c == '>':
            if not in_tag:
                return False
            in_tag = False
    if in_tag:
        return False
    return True
 def check_tags(text):
    # type: (Text) -> Text
    # The current diffing algorithm produces malformed html when text is
    # added to existing new lines. This patch manually corrects that.
    in_tag = False
    if text.endswith('<'):
        text = text[:-1]
    for c in text:
        if c == '<':
            in_tag = True
        elif c == '>' and not in_tag:
            text = '<' + text
            break
    return text
 def highlight_html_differences(s1, s2, msg_id=None):
    # type: (Text, Text, Optional[int]) -> Text
-    differ = diff_match_patch()
+    retval = htmldiff(s1, s2)
-    ops = differ.diff_main(s1, s2)
+    fragment = lxml.html.fromstring(retval)  # type: ignore # https://github.com/python/typeshed/issues/525
    differ.diff_cleanupSemantic(ops)
    retval = u''
    in_tag = False
-    idx = 0
+    for elem in fragment.cssselect('del'):
-    while idx < len(ops):
+        elem.tag = 'span'
-        op, text = ops[idx]
+        elem.set('class', 'highlight_text_deleted')
        text = check_tags(text)
        if idx != 0:
            prev_op, prev_text = ops[idx - 1]
            prev_text = check_tags(prev_text)
            # Remove visual offset from editing newlines
            if '<p><br>' in text:
                text = text.replace('<p><br>', '<p>')
            elif prev_text.endswith('<p>') and text.startswith('<br>'):
                text = text[4:]
        if op == diff_match_patch.DIFF_DELETE:
            chunks, in_tag = chunkize(text, in_tag)
            retval += highlight_chunks(chunks, highlight_deleted)
        elif op == diff_match_patch.DIFF_INSERT:
            chunks, in_tag = chunkize(text, in_tag)
            retval += highlight_chunks(chunks, highlight_inserted)
        elif op == diff_match_patch.DIFF_EQUAL:
            chunks, in_tag = chunkize(text, in_tag)
            retval += text
        idx += 1
-    if not verify_html(retval):
+    for elem in fragment.cssselect('ins'):
-        from zerver.lib.actions import internal_send_message
+        elem.tag = 'span'
-        from zerver.models import get_system_bot
+        elem.set('class', 'highlight_text_inserted')
-        # Normally, one would just throw a JsonableError, but because
+    retval = lxml.html.tostring(fragment)   # type: ignore # https://github.com/python/typeshed/issues/525
        # we don't super trust this algorithm, it makes sense to
        # mostly report the error to the Zulip developers to debug.
        logging.getLogger('').error('HTML diff produced mal-formed HTML for message %s' % (msg_id,))
        if settings.ERROR_BOT is not None:
            subject = "HTML diff failure on %s" % (platform.node(),)
            realm = get_system_bot(settings.ERROR_BOT).realm
            internal_send_message(realm, settings.ERROR_BOT, "stream",
                                  "errors", subject, "HTML diff produced malformed HTML for message %s" % (msg_id,))
        return s2
    return retval
--- a/zerver/tests/test_messages.py
+++ b/zerver/tests/test_messages.py
@@ -1522,8 +1522,8 @@ class EditMessageTest(ZulipTestCase):
                         '<p>content after edit</p>')
        self.assertEqual(message_history_1[1]['content_html_diff'],
                         ('<p>content '
                          '<span class="highlight_text_inserted">after</span> '
                          '<span class="highlight_text_deleted">before</span>'
                          '<span class="highlight_text_inserted">after</span>'
                          ' edit</p>'))
        # Check content of message before edit.
        self.assertEqual(message_history_1[1]['prev_rendered_content'],
@@ -1559,16 +1559,48 @@ class EditMessageTest(ZulipTestCase):
                          'content after edit, line 2<br>\n'
                          'content before edit, line 3</p>'))
        self.assertEqual(message_history_2[1]['content_html_diff'],
-                         ('<p>content before edit, line 1</p>'
+                         ('<p>content before edit, line 1<br> '
-                          '<span class="highlight_text_deleted">\n'
+                          'content <span class="highlight_text_inserted">after edit, line 2<br> '
-                          '</span><p><span class="highlight_text_inserted">\n'
+                          'content</span> before edit, line 3</p>'))
                          'content after edit, line 2</span><br>'
                          '<span class="highlight_text_inserted">\n'
                          '</span>content before edit, line 3</p>'))
        self.assertEqual(message_history_2[1]['prev_rendered_content'],
                         ('<p>content before edit, line 1</p>\n'
                          '<p>content before edit, line 3</p>'))
    def test_edit_link(self):
        # type: () -> None
        # Link editing
        self.login(self.example_email("hamlet"))
        msg_id_1 = self.send_stream_message(
            self.example_email("hamlet"),
            "Scotland",
            topic_name="editing",
            content="Here is a link to [zulip](www.zulip.org).")
        new_content_1 = 'Here is a link to [zulip](www.zulipchat.com).'
        result_1 = self.client_patch("/json/messages/" + str(msg_id_1), {
            'message_id': msg_id_1, 'content': new_content_1
        })
        self.assert_json_success(result_1)
        message_edit_history_1 = self.client_get(
            "/json/messages/" + str(msg_id_1) + "/history")
        json_response_1 = ujson.loads(
            message_edit_history_1.content.decode('utf-8'))
        message_history_1 = json_response_1['message_history']
        # Check content of message after edit.
        self.assertEqual(message_history_1[0]['rendered_content'],
                         '<p>Here is a link to '
                         '<a href="http://www.zulip.org" target="_blank" title="http://www.zulip.org">zulip</a>.</p>')
        self.assertEqual(message_history_1[1]['rendered_content'],
                         '<p>Here is a link to '
                         '<a href="http://www.zulipchat.com" target="_blank" title="http://www.zulipchat.com">zulip</a>.</p>')
        self.assertEqual(message_history_1[1]['content_html_diff'],
                         ('<p>Here is a link to <a href="http://www.zulipchat.com" '
                          'target="_blank" title="http://www.zulipchat.com">zulip '
                          '<span class="highlight_text_inserted"> Link: http://www.zulipchat.com .'
                          '</span> <span class="highlight_text_deleted"> Link: http://www.zulip.org .'
                          '</span> </a></p>'))
    def test_user_info_for_updates(self):
        # type: () -> None
        hamlet = self.example_user('hamlet')