email_mirror: Don't remove quotations from forwarded messages.

Addresses point 2 of #10612. We use a regex to detect if a form
of FWD indicator is present at the beginning of the subject, which
means the message has been forwarded.
remove_quotations argument is added to a couple of functions where
it's necessary.
In filter_footer, the criterion for a line to be a possible beginning
of a footer is changed to line.strip() == "--", instead of
line.strip().startswith("--"), because the previous startswith check
would remove quotations from plaintext emails. This change makes sense,
because RFC 3676 describes "-- " as "the separator line between the body
and the signature of a message":
https://tools.ietf.org/html/rfc3676
This commit is contained in:
Mateusz Mandera
2019-03-09 22:35:45 +01:00
committed by Tim Abbott
parent 0633f268fb
commit edcb6d57fc
2 changed files with 48 additions and 8 deletions

View File

@@ -133,9 +133,9 @@ def mark_missed_message_address_as_used(address: str) -> None:
redis_client.delete(key) redis_client.delete(key)
raise ZulipEmailForwardError('Missed message address has already been used') raise ZulipEmailForwardError('Missed message address has already been used')
def construct_zulip_body(message: message.Message, realm: Realm, def construct_zulip_body(message: message.Message, realm: Realm, show_sender: bool=False,
show_sender: bool=False) -> str: remove_quotations: bool=True) -> str:
body = extract_body(message) body = extract_body(message, remove_quotations)
# Remove null characters, since Zulip will reject # Remove null characters, since Zulip will reject
body = body.replace("\x00", "") body = body.replace("\x00", "")
body = filter_footer(body) body = filter_footer(body)
@@ -227,7 +227,7 @@ def get_message_part_by_type(message: message.Message, content_type: str) -> Opt
return None return None
talon_initialized = False talon_initialized = False
def extract_body(message: message.Message) -> str: def extract_body(message: message.Message, remove_quotations: bool=True) -> str:
import talon import talon
global talon_initialized global talon_initialized
if not talon_initialized: if not talon_initialized:
@@ -238,12 +238,18 @@ def extract_body(message: message.Message) -> str:
# that. # that.
plaintext_content = get_message_part_by_type(message, "text/plain") plaintext_content = get_message_part_by_type(message, "text/plain")
if plaintext_content: if plaintext_content:
return talon.quotations.extract_from_plain(plaintext_content) if remove_quotations:
return talon.quotations.extract_from_plain(plaintext_content)
else:
return plaintext_content
# If we only have an HTML version, try to make that look nice. # If we only have an HTML version, try to make that look nice.
html_content = get_message_part_by_type(message, "text/html") html_content = get_message_part_by_type(message, "text/html")
if html_content: if html_content:
return convert_html_to_markdown(talon.quotations.extract_from_html(html_content)) if remove_quotations:
return convert_html_to_markdown(talon.quotations.extract_from_html(html_content))
else:
return convert_html_to_markdown(html_content)
if plaintext_content is not None or html_content is not None: if plaintext_content is not None or html_content is not None:
raise ZulipEmailForwardUserError("Email has no nonempty body sections; ignoring.") raise ZulipEmailForwardUserError("Email has no nonempty body sections; ignoring.")
@@ -253,7 +259,7 @@ def extract_body(message: message.Message) -> str:
def filter_footer(text: str) -> str: def filter_footer(text: str) -> str:
# Try to filter out obvious footers. # Try to filter out obvious footers.
possible_footers = [line for line in text.split("\n") if line.strip().startswith("--")] possible_footers = [line for line in text.split("\n") if line.strip() == "--"]
if len(possible_footers) != 1: if len(possible_footers) != 1:
# Be conservative and don't try to scrub content if there # Be conservative and don't try to scrub content if there
# isn't a trivial footer structure. # isn't a trivial footer structure.
@@ -326,13 +332,21 @@ def strip_from_subject(subject: str) -> str:
stripped = re.sub(reg, "", subject, flags = re.IGNORECASE | re.MULTILINE) stripped = re.sub(reg, "", subject, flags = re.IGNORECASE | re.MULTILINE)
return stripped.strip() return stripped.strip()
def is_forwarded(subject: str) -> bool:
    """Return True if ``subject`` starts with a forwarding marker.

    A marker is "FW" or "FWD" in any letter case, optionally preceded by
    an opening "[" or "(", and followed either by separator punctuation
    (one of ``-``, ``:``, ``;``, ``)``, ``]``) or by the end of the subject.
    Only the very beginning of the subject is examined, so "RE: FWD: hi"
    is not considered forwarded.

    Args:
        subject: The decoded Subject header of the incoming email.

    Returns:
        True when a forwarding marker is found at the start, else False.
    """
    # Regex taken from strip_from_subject; we use it to detect various
    # forms of FWD at the beginning of the subject.
    #
    # The pattern previously ended with an extra top-level alternative,
    # "|\]+ *$". Under re.match that alternative made a subject consisting
    # solely of "]" characters (optionally followed by spaces) count as
    # forwarded, which has nothing to do with a FWD marker, so it is
    # dropped here. All other inputs match exactly as before.
    reg = r"([\[\(] *)?\b(FWD?) *([-:;)\]][ :;\])-]*|$)"
    # re.match anchors at position 0, which is what restricts detection to
    # a marker at the start of the subject.
    return re.match(reg, subject, flags=re.IGNORECASE) is not None
def process_stream_message(to: str, message: message.Message, def process_stream_message(to: str, message: message.Message,
debug_info: Dict[str, Any]) -> None: debug_info: Dict[str, Any]) -> None:
subject_header = str(make_header(decode_header(message.get("Subject", "")))) subject_header = str(make_header(decode_header(message.get("Subject", ""))))
subject = strip_from_subject(subject_header) or "(no topic)" subject = strip_from_subject(subject_header) or "(no topic)"
stream, show_sender = extract_and_validate(to) stream, show_sender = extract_and_validate(to)
body = construct_zulip_body(message, stream.realm, show_sender) # Don't remove quotations if message is forwarded:
remove_quotations = not is_forwarded(subject_header)
body = construct_zulip_body(message, stream.realm, show_sender, remove_quotations)
debug_info["stream"] = stream debug_info["stream"] = stream
send_zulip(settings.EMAIL_GATEWAY_BOT, stream, subject, body) send_zulip(settings.EMAIL_GATEWAY_BOT, stream, subject, body)
logger.info("Successfully processed email to %s (%s)" % ( logger.info("Successfully processed email to %s (%s)" % (

View File

@@ -30,8 +30,10 @@ from zerver.lib.email_mirror import (
create_missed_message_address, create_missed_message_address,
get_missed_message_token_from_address, get_missed_message_token_from_address,
strip_from_subject, strip_from_subject,
is_forwarded,
) )
from zerver.lib.notifications import convert_html_to_markdown
from zerver.lib.send_email import FromAddress from zerver.lib.send_email import FromAddress
from email.mime.text import MIMEText from email.mime.text import MIMEText
@@ -445,6 +447,17 @@ class TestEmptyGatewaySetting(ZulipTestCase):
self.assertEqual(test_address, '') self.assertEqual(test_address, '')
class TestReplyExtraction(ZulipTestCase): class TestReplyExtraction(ZulipTestCase):
def test_is_forwarded(self) -> None:
    """Check is_forwarded on subjects with and without a leading FWD marker."""
    # Subjects that begin with some form of the marker.
    forwarded_subjects = [
        "FWD: hey",
        "fwd: hi",
        "[fwd] subject",
        "FWD: RE:",
        "Fwd: RE: fwd: re: subject",
    ]
    for candidate in forwarded_subjects:
        self.assertTrue(is_forwarded(candidate))

    # Either no marker at all, or a marker that is not at the very start.
    plain_subjects = [
        "subject",
        "RE: FWD: hi",
    ]
    for candidate in plain_subjects:
        self.assertFalse(is_forwarded(candidate))
def test_reply_is_extracted_from_plain(self) -> None: def test_reply_is_extracted_from_plain(self) -> None:
# build dummy messages for stream # build dummy messages for stream
@@ -476,6 +489,13 @@ class TestReplyExtraction(ZulipTestCase):
self.assertEqual(message.content, "Reply") self.assertEqual(message.content, "Reply")
# Don't extract if Subject indicates the email has been forwarded into the mirror:
del incoming_valid_message['Subject']
incoming_valid_message['Subject'] = 'FWD: TestStreamEmailMessages Subject'
process_message(incoming_valid_message)
message = most_recent_message(user_profile)
self.assertEqual(message.content, text)
def test_reply_is_extracted_from_html(self) -> None: def test_reply_is_extracted_from_html(self) -> None:
# build dummy messages for stream # build dummy messages for stream
@@ -520,6 +540,12 @@ class TestReplyExtraction(ZulipTestCase):
self.assertEqual(message.content, 'Reply') self.assertEqual(message.content, 'Reply')
# Don't extract if Subject indicates the email has been forwarded into the mirror:
del incoming_valid_message['Subject']
incoming_valid_message['Subject'] = 'FWD: TestStreamEmailMessages Subject'
process_message(incoming_valid_message)
message = most_recent_message(user_profile)
self.assertEqual(message.content, convert_html_to_markdown(html))
class TestScriptMTA(ZulipTestCase): class TestScriptMTA(ZulipTestCase):