mirror of
https://github.com/zulip/zulip.git
synced 2025-11-11 01:16:19 +00:00
html_diff: Migrate to use lxml.
We were using Google's diff-match-patch library to diff HTML. The problem with that approach is that it is a text differ, not an HTML differ and so it ends up messing up the HTML tags. `lxml` is a safer option. Fixes: #7219.
This commit is contained in:
committed by
Tim Abbott
parent
c160c06f9c
commit
c863bb83a0
@@ -1,138 +1,25 @@
|
||||
import lxml
|
||||
|
||||
from typing import Callable, List, Optional, Tuple, Text
|
||||
from lxml.html.diff import htmldiff
|
||||
from typing import Optional, Text
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from diff_match_patch import diff_match_patch
|
||||
import platform
|
||||
import logging
|
||||
|
||||
# TODO: handle changes in link hrefs
|
||||
|
||||
def highlight_with_class(klass, text):
|
||||
def highlight_with_class(text, klass):
|
||||
# type: (Text, Text) -> Text
|
||||
return '<span class="%s">%s</span>' % (klass, text)
|
||||
|
||||
def highlight_inserted(text):
|
||||
# type: (Text) -> Text
|
||||
return highlight_with_class('highlight_text_inserted', text)
|
||||
|
||||
def highlight_deleted(text):
|
||||
# type: (Text) -> Text
|
||||
return highlight_with_class('highlight_text_deleted', text)
|
||||
|
||||
def chunkize(text, in_tag):
|
||||
# type: (Text, bool) -> Tuple[List[Tuple[Text, Text]], bool]
|
||||
start = 0
|
||||
idx = 0
|
||||
chunks = [] # type: List[Tuple[Text, Text]]
|
||||
for c in text:
|
||||
if c == '<':
|
||||
in_tag = True
|
||||
if start != idx:
|
||||
chunks.append(('text', text[start:idx]))
|
||||
start = idx
|
||||
elif c == '>':
|
||||
in_tag = False
|
||||
if start != idx + 1:
|
||||
chunks.append(('tag', text[start:idx + 1]))
|
||||
start = idx + 1
|
||||
idx += 1
|
||||
|
||||
if start != idx:
|
||||
chunks.append(('tag' if in_tag else 'text', text[start:idx]))
|
||||
return chunks, in_tag
|
||||
|
||||
def highlight_chunks(chunks, highlight_func):
|
||||
# type: (List[Tuple[Text, Text]], Callable[[Text], Text]) -> Text
|
||||
retval = u''
|
||||
for type, text in chunks:
|
||||
if type == 'text':
|
||||
retval += highlight_func(text)
|
||||
else:
|
||||
retval += text
|
||||
return retval
|
||||
|
||||
def verify_html(html):
|
||||
# type: (Text) -> bool
|
||||
# TODO: Actually parse the resulting HTML to ensure we don't
|
||||
# create mal-formed markup. This is unfortunately hard because
|
||||
# we both want pretty strict parsing and we want to parse html5
|
||||
# fragments. For now, we do a basic sanity check.
|
||||
in_tag = False
|
||||
for c in html:
|
||||
if c == '<':
|
||||
if in_tag:
|
||||
return False
|
||||
in_tag = True
|
||||
elif c == '>':
|
||||
if not in_tag:
|
||||
return False
|
||||
in_tag = False
|
||||
if in_tag:
|
||||
return False
|
||||
return True
|
||||
|
||||
def check_tags(text):
|
||||
# type: (Text) -> Text
|
||||
# The current diffing algorithm produces malformed html when text is
|
||||
# added to existing new lines. This patch manually corrects that.
|
||||
in_tag = False
|
||||
if text.endswith('<'):
|
||||
text = text[:-1]
|
||||
for c in text:
|
||||
if c == '<':
|
||||
in_tag = True
|
||||
elif c == '>' and not in_tag:
|
||||
text = '<' + text
|
||||
break
|
||||
return text
|
||||
|
||||
def highlight_html_differences(s1, s2, msg_id=None):
|
||||
# type: (Text, Text, Optional[int]) -> Text
|
||||
differ = diff_match_patch()
|
||||
ops = differ.diff_main(s1, s2)
|
||||
differ.diff_cleanupSemantic(ops)
|
||||
retval = u''
|
||||
in_tag = False
|
||||
retval = htmldiff(s1, s2)
|
||||
fragment = lxml.html.fromstring(retval) # type: ignore # https://github.com/python/typeshed/issues/525
|
||||
|
||||
idx = 0
|
||||
while idx < len(ops):
|
||||
op, text = ops[idx]
|
||||
text = check_tags(text)
|
||||
if idx != 0:
|
||||
prev_op, prev_text = ops[idx - 1]
|
||||
prev_text = check_tags(prev_text)
|
||||
# Remove visual offset from editing newlines
|
||||
if '<p><br>' in text:
|
||||
text = text.replace('<p><br>', '<p>')
|
||||
elif prev_text.endswith('<p>') and text.startswith('<br>'):
|
||||
text = text[4:]
|
||||
if op == diff_match_patch.DIFF_DELETE:
|
||||
chunks, in_tag = chunkize(text, in_tag)
|
||||
retval += highlight_chunks(chunks, highlight_deleted)
|
||||
elif op == diff_match_patch.DIFF_INSERT:
|
||||
chunks, in_tag = chunkize(text, in_tag)
|
||||
retval += highlight_chunks(chunks, highlight_inserted)
|
||||
elif op == diff_match_patch.DIFF_EQUAL:
|
||||
chunks, in_tag = chunkize(text, in_tag)
|
||||
retval += text
|
||||
idx += 1
|
||||
for elem in fragment.cssselect('del'):
|
||||
elem.tag = 'span'
|
||||
elem.set('class', 'highlight_text_deleted')
|
||||
|
||||
if not verify_html(retval):
|
||||
from zerver.lib.actions import internal_send_message
|
||||
from zerver.models import get_system_bot
|
||||
for elem in fragment.cssselect('ins'):
|
||||
elem.tag = 'span'
|
||||
elem.set('class', 'highlight_text_inserted')
|
||||
|
||||
# Normally, one would just throw a JsonableError, but because
|
||||
# we don't super trust this algorithm, it makes sense to
|
||||
# mostly report the error to the Zulip developers to debug.
|
||||
logging.getLogger('').error('HTML diff produced mal-formed HTML for message %s' % (msg_id,))
|
||||
|
||||
if settings.ERROR_BOT is not None:
|
||||
subject = "HTML diff failure on %s" % (platform.node(),)
|
||||
realm = get_system_bot(settings.ERROR_BOT).realm
|
||||
internal_send_message(realm, settings.ERROR_BOT, "stream",
|
||||
"errors", subject, "HTML diff produced malformed HTML for message %s" % (msg_id,))
|
||||
return s2
|
||||
retval = lxml.html.tostring(fragment) # type: ignore # https://github.com/python/typeshed/issues/525
|
||||
|
||||
return retval
|
||||
|
||||
@@ -1522,8 +1522,8 @@ class EditMessageTest(ZulipTestCase):
|
||||
'<p>content after edit</p>')
|
||||
self.assertEqual(message_history_1[1]['content_html_diff'],
|
||||
('<p>content '
|
||||
'<span class="highlight_text_inserted">after</span> '
|
||||
'<span class="highlight_text_deleted">before</span>'
|
||||
'<span class="highlight_text_inserted">after</span>'
|
||||
' edit</p>'))
|
||||
# Check content of message before edit.
|
||||
self.assertEqual(message_history_1[1]['prev_rendered_content'],
|
||||
@@ -1559,16 +1559,48 @@ class EditMessageTest(ZulipTestCase):
|
||||
'content after edit, line 2<br>\n'
|
||||
'content before edit, line 3</p>'))
|
||||
self.assertEqual(message_history_2[1]['content_html_diff'],
|
||||
('<p>content before edit, line 1</p>'
|
||||
'<span class="highlight_text_deleted">\n'
|
||||
'</span><p><span class="highlight_text_inserted">\n'
|
||||
'content after edit, line 2</span><br>'
|
||||
'<span class="highlight_text_inserted">\n'
|
||||
'</span>content before edit, line 3</p>'))
|
||||
('<p>content before edit, line 1<br> '
|
||||
'content <span class="highlight_text_inserted">after edit, line 2<br> '
|
||||
'content</span> before edit, line 3</p>'))
|
||||
self.assertEqual(message_history_2[1]['prev_rendered_content'],
|
||||
('<p>content before edit, line 1</p>\n'
|
||||
'<p>content before edit, line 3</p>'))
|
||||
|
||||
def test_edit_link(self):
|
||||
# type: () -> None
|
||||
# Link editing
|
||||
self.login(self.example_email("hamlet"))
|
||||
msg_id_1 = self.send_stream_message(
|
||||
self.example_email("hamlet"),
|
||||
"Scotland",
|
||||
topic_name="editing",
|
||||
content="Here is a link to [zulip](www.zulip.org).")
|
||||
new_content_1 = 'Here is a link to [zulip](www.zulipchat.com).'
|
||||
result_1 = self.client_patch("/json/messages/" + str(msg_id_1), {
|
||||
'message_id': msg_id_1, 'content': new_content_1
|
||||
})
|
||||
self.assert_json_success(result_1)
|
||||
|
||||
message_edit_history_1 = self.client_get(
|
||||
"/json/messages/" + str(msg_id_1) + "/history")
|
||||
json_response_1 = ujson.loads(
|
||||
message_edit_history_1.content.decode('utf-8'))
|
||||
message_history_1 = json_response_1['message_history']
|
||||
|
||||
# Check content of message after edit.
|
||||
self.assertEqual(message_history_1[0]['rendered_content'],
|
||||
'<p>Here is a link to '
|
||||
'<a href="http://www.zulip.org" target="_blank" title="http://www.zulip.org">zulip</a>.</p>')
|
||||
self.assertEqual(message_history_1[1]['rendered_content'],
|
||||
'<p>Here is a link to '
|
||||
'<a href="http://www.zulipchat.com" target="_blank" title="http://www.zulipchat.com">zulip</a>.</p>')
|
||||
self.assertEqual(message_history_1[1]['content_html_diff'],
|
||||
('<p>Here is a link to <a href="http://www.zulipchat.com" '
|
||||
'target="_blank" title="http://www.zulipchat.com">zulip '
|
||||
'<span class="highlight_text_inserted"> Link: http://www.zulipchat.com .'
|
||||
'</span> <span class="highlight_text_deleted"> Link: http://www.zulip.org .'
|
||||
'</span> </a></p>'))
|
||||
|
||||
def test_user_info_for_updates(self):
|
||||
# type: () -> None
|
||||
hamlet = self.example_user('hamlet')
|
||||
|
||||
Reference in New Issue
Block a user