email_gateway: Use html2text directly.

In the rare case that Zulip receives an email with only an HTML part, we originally (code dating to 2013) shelled out to html2markdown/python-html2text in order to convert the HTML into Markdown. We have long since added html2text as a reasonably managed Python dependency of Zulip; we should just use it here.
2025-11-02 13:03:29 +00:00 · 2019-07-23 17:49:16 -07:00
parent 2ac944c31f
commit daca742e9f
2 changed files with 4 additions and 17 deletions
--- a/zerver/lib/email_notifications.py
+++ b/zerver/lib/email_notifications.py
@@ -27,10 +27,10 @@ from zerver.models import (

 from datetime import timedelta
 from email.utils import formataddr
+import html2text
 from lxml.cssselect import CSSSelector
 import lxml.html
 import re
-import subprocess
 from collections import defaultdict
 import pytz
 from bs4 import BeautifulSoup
@@ -580,22 +580,9 @@ def enqueue_welcome_emails(user: UserProfile, realm_creation: bool=False) -> Non
            from_address=from_address, context=context, delay=followup_day2_email_delay(user))

 def convert_html_to_markdown(html: str) -> str:
-    # On Linux, the tool installs as html2markdown, and there's a command called
-    # html2text that does something totally different. On OSX, the tool installs
-    # as html2text.
-    commands = ["html2markdown", "html2text"]
+    parser = html2text.HTML2Text()
+    markdown = parser.handle(html).strip()

-    for command in commands:
-        try:
-            # A body width of 0 means do not try to wrap the text for us.
-            p = subprocess.Popen(
-                [command, "--body-width=0"], stdout=subprocess.PIPE,
-                stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
-            break
-        except OSError:
-            continue
-
-    markdown = p.communicate(input=html.encode('utf-8'))[0].decode('utf-8').strip()
    # We want images to get linked and inline previewed, but html2text will turn
    # them into links of the form `![](http://foo.com/image.png)`, which is
    # ugly. Run a regex over the resulting description, turning links of the