html_diff: Migrate to use lxml.

We were using Google's diff-match-patch library to diff HTML. The
problem with that approach is that it is a text differ, not an HTML
differ and so it ends up messing up the HTML tags. `lxml` is a safer
option.

Fixes: #7219.
This commit is contained in:
Harshit Bansal
2017-10-30 20:11:34 +00:00
committed by Tim Abbott
parent c160c06f9c
commit c863bb83a0
2 changed files with 52 additions and 133 deletions

View File

@@ -1,138 +1,25 @@
import lxml
from typing import Callable, List, Optional, Tuple, Text from lxml.html.diff import htmldiff
from typing import Optional, Text
from django.conf import settings def highlight_with_class(text, klass):
from diff_match_patch import diff_match_patch
import platform
import logging
# TODO: handle changes in link hrefs
def highlight_with_class(klass, text):
# type: (Text, Text) -> Text # type: (Text, Text) -> Text
return '<span class="%s">%s</span>' % (klass, text) return '<span class="%s">%s</span>' % (klass, text)
def highlight_inserted(text):
# type: (Text) -> Text
return highlight_with_class('highlight_text_inserted', text)
def highlight_deleted(text):
# type: (Text) -> Text
return highlight_with_class('highlight_text_deleted', text)
def chunkize(text, in_tag):
# type: (Text, bool) -> Tuple[List[Tuple[Text, Text]], bool]
start = 0
idx = 0
chunks = [] # type: List[Tuple[Text, Text]]
for c in text:
if c == '<':
in_tag = True
if start != idx:
chunks.append(('text', text[start:idx]))
start = idx
elif c == '>':
in_tag = False
if start != idx + 1:
chunks.append(('tag', text[start:idx + 1]))
start = idx + 1
idx += 1
if start != idx:
chunks.append(('tag' if in_tag else 'text', text[start:idx]))
return chunks, in_tag
def highlight_chunks(chunks, highlight_func):
# type: (List[Tuple[Text, Text]], Callable[[Text], Text]) -> Text
retval = u''
for type, text in chunks:
if type == 'text':
retval += highlight_func(text)
else:
retval += text
return retval
def verify_html(html):
# type: (Text) -> bool
# TODO: Actually parse the resulting HTML to ensure we don't
# create mal-formed markup. This is unfortunately hard because
# we both want pretty strict parsing and we want to parse html5
# fragments. For now, we do a basic sanity check.
in_tag = False
for c in html:
if c == '<':
if in_tag:
return False
in_tag = True
elif c == '>':
if not in_tag:
return False
in_tag = False
if in_tag:
return False
return True
def check_tags(text):
# type: (Text) -> Text
# The current diffing algorithm produces malformed html when text is
# added to existing new lines. This patch manually corrects that.
in_tag = False
if text.endswith('<'):
text = text[:-1]
for c in text:
if c == '<':
in_tag = True
elif c == '>' and not in_tag:
text = '<' + text
break
return text
def highlight_html_differences(s1, s2, msg_id=None): def highlight_html_differences(s1, s2, msg_id=None):
# type: (Text, Text, Optional[int]) -> Text # type: (Text, Text, Optional[int]) -> Text
differ = diff_match_patch() retval = htmldiff(s1, s2)
ops = differ.diff_main(s1, s2) fragment = lxml.html.fromstring(retval) # type: ignore # https://github.com/python/typeshed/issues/525
differ.diff_cleanupSemantic(ops)
retval = u''
in_tag = False
idx = 0 for elem in fragment.cssselect('del'):
while idx < len(ops): elem.tag = 'span'
op, text = ops[idx] elem.set('class', 'highlight_text_deleted')
text = check_tags(text)
if idx != 0:
prev_op, prev_text = ops[idx - 1]
prev_text = check_tags(prev_text)
# Remove visual offset from editing newlines
if '<p><br>' in text:
text = text.replace('<p><br>', '<p>')
elif prev_text.endswith('<p>') and text.startswith('<br>'):
text = text[4:]
if op == diff_match_patch.DIFF_DELETE:
chunks, in_tag = chunkize(text, in_tag)
retval += highlight_chunks(chunks, highlight_deleted)
elif op == diff_match_patch.DIFF_INSERT:
chunks, in_tag = chunkize(text, in_tag)
retval += highlight_chunks(chunks, highlight_inserted)
elif op == diff_match_patch.DIFF_EQUAL:
chunks, in_tag = chunkize(text, in_tag)
retval += text
idx += 1
if not verify_html(retval): for elem in fragment.cssselect('ins'):
from zerver.lib.actions import internal_send_message elem.tag = 'span'
from zerver.models import get_system_bot elem.set('class', 'highlight_text_inserted')
# Normally, one would just throw a JsonableError, but because retval = lxml.html.tostring(fragment) # type: ignore # https://github.com/python/typeshed/issues/525
# we don't super trust this algorithm, it makes sense to
# mostly report the error to the Zulip developers to debug.
logging.getLogger('').error('HTML diff produced mal-formed HTML for message %s' % (msg_id,))
if settings.ERROR_BOT is not None:
subject = "HTML diff failure on %s" % (platform.node(),)
realm = get_system_bot(settings.ERROR_BOT).realm
internal_send_message(realm, settings.ERROR_BOT, "stream",
"errors", subject, "HTML diff produced malformed HTML for message %s" % (msg_id,))
return s2
return retval return retval

View File

@@ -1522,8 +1522,8 @@ class EditMessageTest(ZulipTestCase):
'<p>content after edit</p>') '<p>content after edit</p>')
self.assertEqual(message_history_1[1]['content_html_diff'], self.assertEqual(message_history_1[1]['content_html_diff'],
('<p>content ' ('<p>content '
'<span class="highlight_text_inserted">after</span> '
'<span class="highlight_text_deleted">before</span>' '<span class="highlight_text_deleted">before</span>'
'<span class="highlight_text_inserted">after</span>'
' edit</p>')) ' edit</p>'))
# Check content of message before edit. # Check content of message before edit.
self.assertEqual(message_history_1[1]['prev_rendered_content'], self.assertEqual(message_history_1[1]['prev_rendered_content'],
@@ -1559,16 +1559,48 @@ class EditMessageTest(ZulipTestCase):
'content after edit, line 2<br>\n' 'content after edit, line 2<br>\n'
'content before edit, line 3</p>')) 'content before edit, line 3</p>'))
self.assertEqual(message_history_2[1]['content_html_diff'], self.assertEqual(message_history_2[1]['content_html_diff'],
('<p>content before edit, line 1</p>' ('<p>content before edit, line 1<br> '
'<span class="highlight_text_deleted">\n' 'content <span class="highlight_text_inserted">after edit, line 2<br> '
'</span><p><span class="highlight_text_inserted">\n' 'content</span> before edit, line 3</p>'))
'content after edit, line 2</span><br>'
'<span class="highlight_text_inserted">\n'
'</span>content before edit, line 3</p>'))
self.assertEqual(message_history_2[1]['prev_rendered_content'], self.assertEqual(message_history_2[1]['prev_rendered_content'],
('<p>content before edit, line 1</p>\n' ('<p>content before edit, line 1</p>\n'
'<p>content before edit, line 3</p>')) '<p>content before edit, line 3</p>'))
def test_edit_link(self):
# type: () -> None
# Link editing
self.login(self.example_email("hamlet"))
msg_id_1 = self.send_stream_message(
self.example_email("hamlet"),
"Scotland",
topic_name="editing",
content="Here is a link to [zulip](www.zulip.org).")
new_content_1 = 'Here is a link to [zulip](www.zulipchat.com).'
result_1 = self.client_patch("/json/messages/" + str(msg_id_1), {
'message_id': msg_id_1, 'content': new_content_1
})
self.assert_json_success(result_1)
message_edit_history_1 = self.client_get(
"/json/messages/" + str(msg_id_1) + "/history")
json_response_1 = ujson.loads(
message_edit_history_1.content.decode('utf-8'))
message_history_1 = json_response_1['message_history']
# Check content of message after edit.
self.assertEqual(message_history_1[0]['rendered_content'],
'<p>Here is a link to '
'<a href="http://www.zulip.org" target="_blank" title="http://www.zulip.org">zulip</a>.</p>')
self.assertEqual(message_history_1[1]['rendered_content'],
'<p>Here is a link to '
'<a href="http://www.zulipchat.com" target="_blank" title="http://www.zulipchat.com">zulip</a>.</p>')
self.assertEqual(message_history_1[1]['content_html_diff'],
('<p>Here is a link to <a href="http://www.zulipchat.com" '
'target="_blank" title="http://www.zulipchat.com">zulip '
'<span class="highlight_text_inserted"> Link: http://www.zulipchat.com .'
'</span> <span class="highlight_text_deleted"> Link: http://www.zulip.org .'
'</span> </a></p>'))
def test_user_info_for_updates(self): def test_user_info_for_updates(self):
# type: () -> None # type: () -> None
hamlet = self.example_user('hamlet') hamlet = self.example_user('hamlet')