mirror of
https://github.com/zulip/zulip.git
synced 2025-11-04 14:03:30 +00:00
html_diff: Migrate to use lxml.
We were using Google's diff-match-patch library to diff HTML. The problem with that approach is that it is a text differ, not an HTML differ and so it ends up messing up the HTML tags. `lxml` is a safer option. Fixes: #7219.
This commit is contained in:
committed by
Tim Abbott
parent
c160c06f9c
commit
c863bb83a0
@@ -1,138 +1,25 @@
|
|||||||
|
import lxml
|
||||||
|
|
||||||
from typing import Callable, List, Optional, Tuple, Text
|
from lxml.html.diff import htmldiff
|
||||||
|
from typing import Optional, Text
|
||||||
|
|
||||||
from django.conf import settings
|
def highlight_with_class(text, klass):
|
||||||
|
|
||||||
from diff_match_patch import diff_match_patch
|
|
||||||
import platform
|
|
||||||
import logging
|
|
||||||
|
|
||||||
# TODO: handle changes in link hrefs
|
|
||||||
|
|
||||||
def highlight_with_class(klass, text):
|
|
||||||
# type: (Text, Text) -> Text
|
# type: (Text, Text) -> Text
|
||||||
return '<span class="%s">%s</span>' % (klass, text)
|
return '<span class="%s">%s</span>' % (klass, text)
|
||||||
|
|
||||||
def highlight_inserted(text):
|
|
||||||
# type: (Text) -> Text
|
|
||||||
return highlight_with_class('highlight_text_inserted', text)
|
|
||||||
|
|
||||||
def highlight_deleted(text):
|
|
||||||
# type: (Text) -> Text
|
|
||||||
return highlight_with_class('highlight_text_deleted', text)
|
|
||||||
|
|
||||||
def chunkize(text, in_tag):
|
|
||||||
# type: (Text, bool) -> Tuple[List[Tuple[Text, Text]], bool]
|
|
||||||
start = 0
|
|
||||||
idx = 0
|
|
||||||
chunks = [] # type: List[Tuple[Text, Text]]
|
|
||||||
for c in text:
|
|
||||||
if c == '<':
|
|
||||||
in_tag = True
|
|
||||||
if start != idx:
|
|
||||||
chunks.append(('text', text[start:idx]))
|
|
||||||
start = idx
|
|
||||||
elif c == '>':
|
|
||||||
in_tag = False
|
|
||||||
if start != idx + 1:
|
|
||||||
chunks.append(('tag', text[start:idx + 1]))
|
|
||||||
start = idx + 1
|
|
||||||
idx += 1
|
|
||||||
|
|
||||||
if start != idx:
|
|
||||||
chunks.append(('tag' if in_tag else 'text', text[start:idx]))
|
|
||||||
return chunks, in_tag
|
|
||||||
|
|
||||||
def highlight_chunks(chunks, highlight_func):
|
|
||||||
# type: (List[Tuple[Text, Text]], Callable[[Text], Text]) -> Text
|
|
||||||
retval = u''
|
|
||||||
for type, text in chunks:
|
|
||||||
if type == 'text':
|
|
||||||
retval += highlight_func(text)
|
|
||||||
else:
|
|
||||||
retval += text
|
|
||||||
return retval
|
|
||||||
|
|
||||||
def verify_html(html):
|
|
||||||
# type: (Text) -> bool
|
|
||||||
# TODO: Actually parse the resulting HTML to ensure we don't
|
|
||||||
# create mal-formed markup. This is unfortunately hard because
|
|
||||||
# we both want pretty strict parsing and we want to parse html5
|
|
||||||
# fragments. For now, we do a basic sanity check.
|
|
||||||
in_tag = False
|
|
||||||
for c in html:
|
|
||||||
if c == '<':
|
|
||||||
if in_tag:
|
|
||||||
return False
|
|
||||||
in_tag = True
|
|
||||||
elif c == '>':
|
|
||||||
if not in_tag:
|
|
||||||
return False
|
|
||||||
in_tag = False
|
|
||||||
if in_tag:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def check_tags(text):
|
|
||||||
# type: (Text) -> Text
|
|
||||||
# The current diffing algorithm produces malformed html when text is
|
|
||||||
# added to existing new lines. This patch manually corrects that.
|
|
||||||
in_tag = False
|
|
||||||
if text.endswith('<'):
|
|
||||||
text = text[:-1]
|
|
||||||
for c in text:
|
|
||||||
if c == '<':
|
|
||||||
in_tag = True
|
|
||||||
elif c == '>' and not in_tag:
|
|
||||||
text = '<' + text
|
|
||||||
break
|
|
||||||
return text
|
|
||||||
|
|
||||||
def highlight_html_differences(s1, s2, msg_id=None):
|
def highlight_html_differences(s1, s2, msg_id=None):
|
||||||
# type: (Text, Text, Optional[int]) -> Text
|
# type: (Text, Text, Optional[int]) -> Text
|
||||||
differ = diff_match_patch()
|
retval = htmldiff(s1, s2)
|
||||||
ops = differ.diff_main(s1, s2)
|
fragment = lxml.html.fromstring(retval) # type: ignore # https://github.com/python/typeshed/issues/525
|
||||||
differ.diff_cleanupSemantic(ops)
|
|
||||||
retval = u''
|
|
||||||
in_tag = False
|
|
||||||
|
|
||||||
idx = 0
|
for elem in fragment.cssselect('del'):
|
||||||
while idx < len(ops):
|
elem.tag = 'span'
|
||||||
op, text = ops[idx]
|
elem.set('class', 'highlight_text_deleted')
|
||||||
text = check_tags(text)
|
|
||||||
if idx != 0:
|
|
||||||
prev_op, prev_text = ops[idx - 1]
|
|
||||||
prev_text = check_tags(prev_text)
|
|
||||||
# Remove visual offset from editing newlines
|
|
||||||
if '<p><br>' in text:
|
|
||||||
text = text.replace('<p><br>', '<p>')
|
|
||||||
elif prev_text.endswith('<p>') and text.startswith('<br>'):
|
|
||||||
text = text[4:]
|
|
||||||
if op == diff_match_patch.DIFF_DELETE:
|
|
||||||
chunks, in_tag = chunkize(text, in_tag)
|
|
||||||
retval += highlight_chunks(chunks, highlight_deleted)
|
|
||||||
elif op == diff_match_patch.DIFF_INSERT:
|
|
||||||
chunks, in_tag = chunkize(text, in_tag)
|
|
||||||
retval += highlight_chunks(chunks, highlight_inserted)
|
|
||||||
elif op == diff_match_patch.DIFF_EQUAL:
|
|
||||||
chunks, in_tag = chunkize(text, in_tag)
|
|
||||||
retval += text
|
|
||||||
idx += 1
|
|
||||||
|
|
||||||
if not verify_html(retval):
|
for elem in fragment.cssselect('ins'):
|
||||||
from zerver.lib.actions import internal_send_message
|
elem.tag = 'span'
|
||||||
from zerver.models import get_system_bot
|
elem.set('class', 'highlight_text_inserted')
|
||||||
|
|
||||||
# Normally, one would just throw a JsonableError, but because
|
retval = lxml.html.tostring(fragment) # type: ignore # https://github.com/python/typeshed/issues/525
|
||||||
# we don't super trust this algorithm, it makes sense to
|
|
||||||
# mostly report the error to the Zulip developers to debug.
|
|
||||||
logging.getLogger('').error('HTML diff produced mal-formed HTML for message %s' % (msg_id,))
|
|
||||||
|
|
||||||
if settings.ERROR_BOT is not None:
|
|
||||||
subject = "HTML diff failure on %s" % (platform.node(),)
|
|
||||||
realm = get_system_bot(settings.ERROR_BOT).realm
|
|
||||||
internal_send_message(realm, settings.ERROR_BOT, "stream",
|
|
||||||
"errors", subject, "HTML diff produced malformed HTML for message %s" % (msg_id,))
|
|
||||||
return s2
|
|
||||||
|
|
||||||
return retval
|
return retval
|
||||||
|
|||||||
@@ -1522,8 +1522,8 @@ class EditMessageTest(ZulipTestCase):
|
|||||||
'<p>content after edit</p>')
|
'<p>content after edit</p>')
|
||||||
self.assertEqual(message_history_1[1]['content_html_diff'],
|
self.assertEqual(message_history_1[1]['content_html_diff'],
|
||||||
('<p>content '
|
('<p>content '
|
||||||
|
'<span class="highlight_text_inserted">after</span> '
|
||||||
'<span class="highlight_text_deleted">before</span>'
|
'<span class="highlight_text_deleted">before</span>'
|
||||||
'<span class="highlight_text_inserted">after</span>'
|
|
||||||
' edit</p>'))
|
' edit</p>'))
|
||||||
# Check content of message before edit.
|
# Check content of message before edit.
|
||||||
self.assertEqual(message_history_1[1]['prev_rendered_content'],
|
self.assertEqual(message_history_1[1]['prev_rendered_content'],
|
||||||
@@ -1559,16 +1559,48 @@ class EditMessageTest(ZulipTestCase):
|
|||||||
'content after edit, line 2<br>\n'
|
'content after edit, line 2<br>\n'
|
||||||
'content before edit, line 3</p>'))
|
'content before edit, line 3</p>'))
|
||||||
self.assertEqual(message_history_2[1]['content_html_diff'],
|
self.assertEqual(message_history_2[1]['content_html_diff'],
|
||||||
('<p>content before edit, line 1</p>'
|
('<p>content before edit, line 1<br> '
|
||||||
'<span class="highlight_text_deleted">\n'
|
'content <span class="highlight_text_inserted">after edit, line 2<br> '
|
||||||
'</span><p><span class="highlight_text_inserted">\n'
|
'content</span> before edit, line 3</p>'))
|
||||||
'content after edit, line 2</span><br>'
|
|
||||||
'<span class="highlight_text_inserted">\n'
|
|
||||||
'</span>content before edit, line 3</p>'))
|
|
||||||
self.assertEqual(message_history_2[1]['prev_rendered_content'],
|
self.assertEqual(message_history_2[1]['prev_rendered_content'],
|
||||||
('<p>content before edit, line 1</p>\n'
|
('<p>content before edit, line 1</p>\n'
|
||||||
'<p>content before edit, line 3</p>'))
|
'<p>content before edit, line 3</p>'))
|
||||||
|
|
||||||
|
def test_edit_link(self):
|
||||||
|
# type: () -> None
|
||||||
|
# Link editing
|
||||||
|
self.login(self.example_email("hamlet"))
|
||||||
|
msg_id_1 = self.send_stream_message(
|
||||||
|
self.example_email("hamlet"),
|
||||||
|
"Scotland",
|
||||||
|
topic_name="editing",
|
||||||
|
content="Here is a link to [zulip](www.zulip.org).")
|
||||||
|
new_content_1 = 'Here is a link to [zulip](www.zulipchat.com).'
|
||||||
|
result_1 = self.client_patch("/json/messages/" + str(msg_id_1), {
|
||||||
|
'message_id': msg_id_1, 'content': new_content_1
|
||||||
|
})
|
||||||
|
self.assert_json_success(result_1)
|
||||||
|
|
||||||
|
message_edit_history_1 = self.client_get(
|
||||||
|
"/json/messages/" + str(msg_id_1) + "/history")
|
||||||
|
json_response_1 = ujson.loads(
|
||||||
|
message_edit_history_1.content.decode('utf-8'))
|
||||||
|
message_history_1 = json_response_1['message_history']
|
||||||
|
|
||||||
|
# Check content of message after edit.
|
||||||
|
self.assertEqual(message_history_1[0]['rendered_content'],
|
||||||
|
'<p>Here is a link to '
|
||||||
|
'<a href="http://www.zulip.org" target="_blank" title="http://www.zulip.org">zulip</a>.</p>')
|
||||||
|
self.assertEqual(message_history_1[1]['rendered_content'],
|
||||||
|
'<p>Here is a link to '
|
||||||
|
'<a href="http://www.zulipchat.com" target="_blank" title="http://www.zulipchat.com">zulip</a>.</p>')
|
||||||
|
self.assertEqual(message_history_1[1]['content_html_diff'],
|
||||||
|
('<p>Here is a link to <a href="http://www.zulipchat.com" '
|
||||||
|
'target="_blank" title="http://www.zulipchat.com">zulip '
|
||||||
|
'<span class="highlight_text_inserted"> Link: http://www.zulipchat.com .'
|
||||||
|
'</span> <span class="highlight_text_deleted"> Link: http://www.zulip.org .'
|
||||||
|
'</span> </a></p>'))
|
||||||
|
|
||||||
def test_user_info_for_updates(self):
|
def test_user_info_for_updates(self):
|
||||||
# type: () -> None
|
# type: () -> None
|
||||||
hamlet = self.example_user('hamlet')
|
hamlet = self.example_user('hamlet')
|
||||||
|
|||||||
Reference in New Issue
Block a user