mattermost_import: Except error when converting messages HTML.

This adds a try-except block around the html2text call that converts raw
message content from HTML to Markdown.

convert_html_to_text is added mainly for testing convenience. We don't
have any sample of Mattermost's problematic content that could trigger
this sort of error yet, so the test mocks convert_html_to_text to raise
an error instead.

(cherry picked from commit 201a71b575)
This commit is contained in:
PieterCK
2025-05-21 09:48:05 +07:00
committed by Tim Abbott
parent a5ee0e913e
commit 6b9365f616
2 changed files with 36 additions and 2 deletions

View File

@@ -393,6 +393,11 @@ def process_message_attachments(
return content, has_image
def convert_html_to_text(content: str) -> str:
    """Pipe `content` through the external `html2text` command and return its output.

    Raises subprocess.CalledProcessError if the command exits with a
    non-zero status.
    """
    # html2text is GPL licensed, so run it as a subprocess.
    command = ["html2text", "--unicode-snob"]
    return subprocess.check_output(command, input=content, text=True)
def process_raw_message_batch(
realm_id: int,
raw_messages: list[dict[str, Any]],
@@ -439,8 +444,11 @@ def process_raw_message_batch(
mention_user_ids=mention_user_ids,
)
# html2text is GPL licensed, so run it as a subprocess.
content = subprocess.check_output(["html2text", "--unicode-snob"], input=content, text=True)
try:
content = convert_html_to_text(content)
except Exception:
logging.warning("Error converting HTML to text for message: '%s'; continuing", content)
logging.warning(str(raw_message))
date_sent = raw_message["date_sent"]
sender_user_id = raw_message["sender_id"]

View File

@@ -1,5 +1,6 @@
import filecmp
import os
import subprocess
from typing import Any
from unittest.mock import call, patch
@@ -981,3 +982,28 @@ class MatterMostImporter(ZulipTestCase):
self.assertIsNotNone(message.rendered_content)
self.verify_emoji_code_foreign_keys()
def test_fail_process_raw_message_batch(self) -> None:
    # Checks that a failure in `convert_html_to_text` is reported as a
    # warning carrying the message content, and that the conversion run
    # is still driven to completion.
    #
    # TODO: Once we have a sample of message content that can trigger this error
    # we should add that as fixture instead of mocking `convert_html_to_text`.
    fixtures_dir = self.fixture_file_name("", "mattermost_fixtures")
    import_output_dir = self.make_import_output_dir("mattermost")

    # Every call to the patched converter raises this error.
    conversion_failure = subprocess.CalledProcessError(
        returncode=1, cmd="html2text", output="mocked failure"
    )
    expected_warning = (
        "WARNING:root:Error converting HTML to text for message: 'Xxxxxxx!'; continuing"
    )

    with (
        patch("builtins.print"),
        patch(
            "zerver.data_import.mattermost.convert_html_to_text",
            side_effect=conversion_failure,
        ) as failing_converter,
        self.assertLogs(level="WARNING") as captured_logs,
    ):
        do_convert_data(
            mattermost_data_dir=fixtures_dir,
            output_dir=import_output_dir,
            masking_content=True,
        )

    failing_converter.assert_called()
    self.assertIn(expected_warning, captured_logs.output)