diff --git a/zerver/data_import/mattermost.py b/zerver/data_import/mattermost.py
index 31d4cdc51d..4aab1ddb53 100644
--- a/zerver/data_import/mattermost.py
+++ b/zerver/data_import/mattermost.py
@@ -393,6 +393,16 @@ def process_message_attachments(
     return content, has_image
 
 
+def convert_html_to_text(content: str) -> str:
+    """Pipe `content` through `html2text --unicode-snob` and return its output.
+
+    Raises subprocess.CalledProcessError if html2text exits with a
+    non-zero status.
+    """
+    # html2text is GPL licensed, so run it as a subprocess.
+    return subprocess.check_output(["html2text", "--unicode-snob"], input=content, text=True)
+
+
 def process_raw_message_batch(
     realm_id: int,
     raw_messages: list[dict[str, Any]],
@@ -439,8 +449,14 @@ def process_raw_message_batch(
             mention_user_ids=mention_user_ids,
         )
 
-        # html2text is GPL licensed, so run it as a subprocess.
-        content = subprocess.check_output(["html2text", "--unicode-snob"], input=content, text=True)
+        try:
+            content = convert_html_to_text(content)
+        except Exception:
+            # Best effort: keep the unconverted content so one bad message
+            # does not abort the whole import, but record the traceback so
+            # the underlying html2text failure can be diagnosed.
+            logging.warning("Error converting HTML to text for message: '%s'; continuing", content)
+            logging.warning("%s", raw_message, exc_info=True)
 
         date_sent = raw_message["date_sent"]
         sender_user_id = raw_message["sender_id"]
diff --git a/zerver/tests/test_mattermost_importer.py b/zerver/tests/test_mattermost_importer.py
index 778ad98ff1..dc70a92a3a 100644
--- a/zerver/tests/test_mattermost_importer.py
+++ b/zerver/tests/test_mattermost_importer.py
@@ -1,5 +1,6 @@
 import filecmp
 import os
+import subprocess
 from typing import Any
 from unittest.mock import call, patch
 
@@ -981,3 +982,28 @@ class MatterMostImporter(ZulipTestCase):
             self.assertIsNotNone(message.rendered_content)
 
         self.verify_emoji_code_foreign_keys()
+
+    def test_fail_process_raw_message_batch(self) -> None:
+        # TODO: Once we have a sample of message content that can trigger this error
+        # we should add that as fixture instead of mocking `convert_html_to_text`.
+        mattermost_data_dir = self.fixture_file_name("", "mattermost_fixtures")
+        output_dir = self.make_import_output_dir("mattermost")
+
+        with (
+            patch("builtins.print"),
+            patch("zerver.data_import.mattermost.convert_html_to_text") as mock_html2text,
+            self.assertLogs(level="WARNING") as warn_log,
+        ):
+            mock_html2text.side_effect = subprocess.CalledProcessError(
+                returncode=1, cmd="html2text", output="mocked failure"
+            )
+            do_convert_data(
+                mattermost_data_dir=mattermost_data_dir,
+                output_dir=output_dir,
+                masking_content=True,
+            )
+        mock_html2text.assert_called()
+        self.assertIn(
+            "WARNING:root:Error converting HTML to text for message: 'Xxxxxxx!'; continuing",
+            warn_log.output,
+        )