email_gateway: Use html2text directly.

In the rare case that Zulip receives an email that has only an HTML
body, we originally (code dating to 2013) shelled out to
html2markdown/python-html2text in order to convert the HTML into
Markdown.

We have long since added html2text as a reasonably managed Python
dependency of Zulip; we should just use it here.
This commit is contained in:
Tim Abbott
2019-07-23 17:49:16 -07:00
parent 2ac944c31f
commit daca742e9f
2 changed files with 4 additions and 17 deletions

View File

@@ -27,10 +27,10 @@ from zerver.models import (
from datetime import timedelta from datetime import timedelta
from email.utils import formataddr from email.utils import formataddr
import html2text
from lxml.cssselect import CSSSelector from lxml.cssselect import CSSSelector
import lxml.html import lxml.html
import re import re
import subprocess
from collections import defaultdict from collections import defaultdict
import pytz import pytz
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -580,22 +580,9 @@ def enqueue_welcome_emails(user: UserProfile, realm_creation: bool=False) -> Non
from_address=from_address, context=context, delay=followup_day2_email_delay(user)) from_address=from_address, context=context, delay=followup_day2_email_delay(user))
def convert_html_to_markdown(html: str) -> str: def convert_html_to_markdown(html: str) -> str:
# On Linux, the tool installs as html2markdown, and there's a command called parser = html2text.HTML2Text()
# html2text that does something totally different. On OSX, the tool installs markdown = parser.handle(html).strip()
# as html2text.
commands = ["html2markdown", "html2text"]
for command in commands:
try:
# A body width of 0 means do not try to wrap the text for us.
p = subprocess.Popen(
[command, "--body-width=0"], stdout=subprocess.PIPE,
stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
break
except OSError:
continue
markdown = p.communicate(input=html.encode('utf-8'))[0].decode('utf-8').strip()
# We want images to get linked and inline previewed, but html2text will turn # We want images to get linked and inline previewed, but html2text will turn
# them into links of the form `![](http://foo.com/image.png)`, which is # them into links of the form `![](http://foo.com/image.png)`, which is
# ugly. Run a regex over the resulting description, turning links of the # ugly. Run a regex over the resulting description, turning links of the

View File

@@ -114,7 +114,7 @@ Requester Bob <requester-bob@example.com> added a {} note to \
""" """
expected_topic = u"#12: Not enough ☃ guinea pigs" expected_topic = u"#12: Not enough ☃ guinea pigs"
expected_message = """ expected_message = """
Requester \u2603 Bob <requester-bob@example.com> created [ticket #12](http://test1234zzz.freshdesk.com/helpdesk/tickets/12):\n\n``` quote\nThere are too many cat pictures on the internet \u2603. We need more guinea pigs. Exhibit 1:\n\n \n\n\n[guinea_pig.png](http://cdn.freshdesk.com/data/helpdesk/attachments/production/12744808/original/guinea_pig.png)\n```\n\n* **Type**: Problem\n* **Priority**: Urgent\n* **Status**: Open Requester \u2603 Bob <requester-bob@example.com> created [ticket #12](http://test1234zzz.freshdesk.com/helpdesk/tickets/12):\n\n``` quote\nThere are too many cat pictures on the internet \u2603. We need more guinea pigs.\nExhibit 1:\n\n \n\n[guinea_pig.png](http://cdn.freshdesk.com/data/helpdesk/attachments/production/12744808/original/guinea_pig.png)\n```\n\n* **Type**: Problem\n* **Priority**: Urgent\n* **Status**: Open
""".strip() """.strip()
self.api_stream_message(self.TEST_USER_EMAIL, "inline_images", expected_topic, expected_message, self.api_stream_message(self.TEST_USER_EMAIL, "inline_images", expected_topic, expected_message,
content_type="application/x-www-form-urlencoded") content_type="application/x-www-form-urlencoded")