mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-04 05:53:43 +00:00 
			
		
		
		
	html_diff: Migrate to use lxml.
We were using Google's diff-match-patch library to diff HTML. The problem with that approach is that it is a text differ, not an HTML differ and so it ends up messing up the HTML tags. `lxml` is a safer option. Fixes: #7219.
This commit is contained in:
		
				
					committed by
					
						
						Tim Abbott
					
				
			
			
				
	
			
			
			
						parent
						
							c160c06f9c
						
					
				
				
					commit
					c863bb83a0
				
			@@ -1,138 +1,25 @@
 | 
			
		||||
import lxml
 | 
			
		||||
 | 
			
		||||
from typing import Callable, List, Optional, Tuple, Text
 | 
			
		||||
from lxml.html.diff import htmldiff
 | 
			
		||||
from typing import Optional, Text
 | 
			
		||||
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
from diff_match_patch import diff_match_patch
 | 
			
		||||
import platform
 | 
			
		||||
import logging
 | 
			
		||||
 | 
			
		||||
# TODO: handle changes in link hrefs
 | 
			
		||||
 | 
			
		||||
def highlight_with_class(klass, text):
 | 
			
		||||
def highlight_with_class(text, klass):
 | 
			
		||||
    # type: (Text, Text) -> Text
 | 
			
		||||
    return '<span class="%s">%s</span>' % (klass, text)
 | 
			
		||||
 | 
			
		||||
def highlight_inserted(text):
 | 
			
		||||
    # type: (Text) -> Text
 | 
			
		||||
    return highlight_with_class('highlight_text_inserted', text)
 | 
			
		||||
 | 
			
		||||
def highlight_deleted(text):
 | 
			
		||||
    # type: (Text) -> Text
 | 
			
		||||
    return highlight_with_class('highlight_text_deleted', text)
 | 
			
		||||
 | 
			
		||||
def chunkize(text, in_tag):
 | 
			
		||||
    # type: (Text, bool) -> Tuple[List[Tuple[Text, Text]], bool]
 | 
			
		||||
    start = 0
 | 
			
		||||
    idx = 0
 | 
			
		||||
    chunks = []  # type: List[Tuple[Text, Text]]
 | 
			
		||||
    for c in text:
 | 
			
		||||
        if c == '<':
 | 
			
		||||
            in_tag = True
 | 
			
		||||
            if start != idx:
 | 
			
		||||
                chunks.append(('text', text[start:idx]))
 | 
			
		||||
            start = idx
 | 
			
		||||
        elif c == '>':
 | 
			
		||||
            in_tag = False
 | 
			
		||||
            if start != idx + 1:
 | 
			
		||||
                chunks.append(('tag', text[start:idx + 1]))
 | 
			
		||||
            start = idx + 1
 | 
			
		||||
        idx += 1
 | 
			
		||||
 | 
			
		||||
    if start != idx:
 | 
			
		||||
        chunks.append(('tag' if in_tag else 'text', text[start:idx]))
 | 
			
		||||
    return chunks, in_tag
 | 
			
		||||
 | 
			
		||||
def highlight_chunks(chunks, highlight_func):
 | 
			
		||||
    # type: (List[Tuple[Text, Text]], Callable[[Text], Text]) -> Text
 | 
			
		||||
    retval = u''
 | 
			
		||||
    for type, text in chunks:
 | 
			
		||||
        if type == 'text':
 | 
			
		||||
            retval += highlight_func(text)
 | 
			
		||||
        else:
 | 
			
		||||
            retval += text
 | 
			
		||||
    return retval
 | 
			
		||||
 | 
			
		||||
def verify_html(html):
 | 
			
		||||
    # type: (Text) -> bool
 | 
			
		||||
    # TODO: Actually parse the resulting HTML to ensure we don't
 | 
			
		||||
    # create mal-formed markup.  This is unfortunately hard because
 | 
			
		||||
    # we both want pretty strict parsing and we want to parse html5
 | 
			
		||||
    # fragments.  For now, we do a basic sanity check.
 | 
			
		||||
    in_tag = False
 | 
			
		||||
    for c in html:
 | 
			
		||||
        if c == '<':
 | 
			
		||||
            if in_tag:
 | 
			
		||||
                return False
 | 
			
		||||
            in_tag = True
 | 
			
		||||
        elif c == '>':
 | 
			
		||||
            if not in_tag:
 | 
			
		||||
                return False
 | 
			
		||||
            in_tag = False
 | 
			
		||||
    if in_tag:
 | 
			
		||||
        return False
 | 
			
		||||
    return True
 | 
			
		||||
 | 
			
		||||
def check_tags(text):
 | 
			
		||||
    # type: (Text) -> Text
 | 
			
		||||
    # The current diffing algorithm produces malformed html when text is
 | 
			
		||||
    # added to existing new lines. This patch manually corrects that.
 | 
			
		||||
    in_tag = False
 | 
			
		||||
    if text.endswith('<'):
 | 
			
		||||
        text = text[:-1]
 | 
			
		||||
    for c in text:
 | 
			
		||||
        if c == '<':
 | 
			
		||||
            in_tag = True
 | 
			
		||||
        elif c == '>' and not in_tag:
 | 
			
		||||
            text = '<' + text
 | 
			
		||||
            break
 | 
			
		||||
    return text
 | 
			
		||||
 | 
			
		||||
def highlight_html_differences(s1, s2, msg_id=None):
 | 
			
		||||
    # type: (Text, Text, Optional[int]) -> Text
 | 
			
		||||
    differ = diff_match_patch()
 | 
			
		||||
    ops = differ.diff_main(s1, s2)
 | 
			
		||||
    differ.diff_cleanupSemantic(ops)
 | 
			
		||||
    retval = u''
 | 
			
		||||
    in_tag = False
 | 
			
		||||
    retval = htmldiff(s1, s2)
 | 
			
		||||
    fragment = lxml.html.fromstring(retval)  # type: ignore # https://github.com/python/typeshed/issues/525
 | 
			
		||||
 | 
			
		||||
    idx = 0
 | 
			
		||||
    while idx < len(ops):
 | 
			
		||||
        op, text = ops[idx]
 | 
			
		||||
        text = check_tags(text)
 | 
			
		||||
        if idx != 0:
 | 
			
		||||
            prev_op, prev_text = ops[idx - 1]
 | 
			
		||||
            prev_text = check_tags(prev_text)
 | 
			
		||||
            # Remove visual offset from editing newlines
 | 
			
		||||
            if '<p><br>' in text:
 | 
			
		||||
                text = text.replace('<p><br>', '<p>')
 | 
			
		||||
            elif prev_text.endswith('<p>') and text.startswith('<br>'):
 | 
			
		||||
                text = text[4:]
 | 
			
		||||
        if op == diff_match_patch.DIFF_DELETE:
 | 
			
		||||
            chunks, in_tag = chunkize(text, in_tag)
 | 
			
		||||
            retval += highlight_chunks(chunks, highlight_deleted)
 | 
			
		||||
        elif op == diff_match_patch.DIFF_INSERT:
 | 
			
		||||
            chunks, in_tag = chunkize(text, in_tag)
 | 
			
		||||
            retval += highlight_chunks(chunks, highlight_inserted)
 | 
			
		||||
        elif op == diff_match_patch.DIFF_EQUAL:
 | 
			
		||||
            chunks, in_tag = chunkize(text, in_tag)
 | 
			
		||||
            retval += text
 | 
			
		||||
        idx += 1
 | 
			
		||||
    for elem in fragment.cssselect('del'):
 | 
			
		||||
        elem.tag = 'span'
 | 
			
		||||
        elem.set('class', 'highlight_text_deleted')
 | 
			
		||||
 | 
			
		||||
    if not verify_html(retval):
 | 
			
		||||
        from zerver.lib.actions import internal_send_message
 | 
			
		||||
        from zerver.models import get_system_bot
 | 
			
		||||
    for elem in fragment.cssselect('ins'):
 | 
			
		||||
        elem.tag = 'span'
 | 
			
		||||
        elem.set('class', 'highlight_text_inserted')
 | 
			
		||||
 | 
			
		||||
        # Normally, one would just throw a JsonableError, but because
 | 
			
		||||
        # we don't super trust this algorithm, it makes sense to
 | 
			
		||||
        # mostly report the error to the Zulip developers to debug.
 | 
			
		||||
        logging.getLogger('').error('HTML diff produced mal-formed HTML for message %s' % (msg_id,))
 | 
			
		||||
 | 
			
		||||
        if settings.ERROR_BOT is not None:
 | 
			
		||||
            subject = "HTML diff failure on %s" % (platform.node(),)
 | 
			
		||||
            realm = get_system_bot(settings.ERROR_BOT).realm
 | 
			
		||||
            internal_send_message(realm, settings.ERROR_BOT, "stream",
 | 
			
		||||
                                  "errors", subject, "HTML diff produced malformed HTML for message %s" % (msg_id,))
 | 
			
		||||
        return s2
 | 
			
		||||
    retval = lxml.html.tostring(fragment)   # type: ignore # https://github.com/python/typeshed/issues/525
 | 
			
		||||
 | 
			
		||||
    return retval
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user