mirror of
https://github.com/zulip/zulip.git
synced 2025-11-09 16:37:23 +00:00
This uses the unauthed v1 of the Twitter API, which is going to go away soon, but it's fine as an interim measure. (imported from commit 709a250271321f5479854a363875c9da43e6382d)
420 lines
16 KiB
Python
420 lines
16 KiB
Python
import markdown
|
|
import logging
|
|
import traceback
|
|
import urlparse
|
|
import re
|
|
import os.path
|
|
import glob
|
|
import urllib2
|
|
import simplejson
|
|
|
|
from django.core import mail
|
|
from django.conf import settings
|
|
|
|
from zephyr.lib.avatar import gravatar_hash
|
|
from zephyr.lib.bugdown import codehilite, fenced_code
|
|
from zephyr.lib.bugdown.fenced_code import FENCE_RE
|
|
from zephyr.lib.timeout import timeout
|
|
|
|
def walk_tree(root, processor):
    """Apply ``processor`` to every descendant of ``root`` and collect results.

    The tree is walked iteratively with an explicit stack.  ``root`` itself is
    never passed to ``processor``; only its children, grandchildren, and so on.

    Arguments:
        root: an ElementTree-style element (iterable over its direct
            children, with ``len()`` giving the number of children).
        processor: callable taking one element; a return value of ``None``
            means "nothing to report" and is dropped.

    Returns a list of the non-``None`` values returned by ``processor``, in
    the order the elements were visited.
    """
    results = []
    pending = [root]

    while pending:
        parent = pending.pop()
        # Iterate the element directly rather than calling getchildren():
        # that method is deprecated and no longer exists in
        # xml.etree.ElementTree as of Python 3.9.
        for child in parent:
            # Only elements that have children of their own need a later
            # visit; len(element) is its number of direct children.
            if len(child):
                pending.append(child)

            result = processor(child)
            if result is not None:
                results.append(result)

    return results
|
|
|
|
class InlineImagePreviewProcessor(markdown.treeprocessors.Treeprocessor):
    """Tree processor that appends inline image previews to a message.

    Every ``<a>`` element in the rendered tree is inspected; for each link
    that is a YouTube video, a Dropbox-hosted image or a direct image URL, a
    linked ``<img class="message_inline_image">`` is appended to the root.
    """

    # List from http://support.google.com/chromeos/bin/answer.py?hl=en&answer=183093
    # Every entry carries its leading dot so that a path merely ending in the
    # letters "jpeg" (with no extension) is not mistaken for an image.
    IMAGE_EXTENSIONS = (".bmp", ".gif", ".jpg", ".jpeg", ".png", ".webp")

    # Youtube video id extraction regular expression from http://pastebin.com/KyKAFv1s
    # If it matches, match.group(2) is the video id.
    YOUTUBE_RE = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'

    def is_image(self, url):
        """Return True if the path component of ``url`` has an image extension."""
        path = urlparse.urlparse(url).path.lower()
        return path.endswith(self.IMAGE_EXTENSIONS)

    def dropbox_image(self, url):
        """Return a direct-download URL for a Dropbox share link to an image.

        Returns None unless ``url`` is an image hosted on dropbox.com (or a
        subdomain) whose path starts with ``/s/``.
        """
        if not self.is_image(url):
            return None
        parsed_url = urlparse.urlparse(url)
        on_dropbox = (parsed_url.netloc == 'dropbox.com'
                      or parsed_url.netloc.endswith('.dropbox.com'))
        if on_dropbox and parsed_url.path.startswith('/s/'):
            return "%s?dl=1" % (url,)
        return None

    def youtube_image(self, url):
        """Return the thumbnail URL for a YouTube link, or None if no match."""
        match = re.match(self.YOUTUBE_RE, url)
        if match is None:
            return None
        return "http://i.ytimg.com/vi/%s/default.jpg" % (match.group(2),)

    # Search the tree for <a> tags and read their href values
    def find_images(self, root):
        """Return a list of ``(image_url, link_url)`` pairs found under ``root``."""
        def process_image_links(element):
            if element.tag != "a":
                return None

            url = element.get("href")
            if not url:
                # An anchor without an href has nothing to preview, and the
                # helpers below cannot handle None.
                return None

            youtube = self.youtube_image(url)
            if youtube is not None:
                return (youtube, url)
            dropbox = self.dropbox_image(url)
            if dropbox is not None:
                return (dropbox, url)
            if self.is_image(url):
                return (url, url)
            return None

        return walk_tree(root, process_image_links)

    def run(self, root):
        """Append one linked preview image to ``root`` per previewable link."""
        # find_images() finishes walking the tree before anything is appended,
        # so the previews added here are never themselves inspected.
        image_urls = self.find_images(root)
        for (url, link) in image_urls:
            a = markdown.util.etree.SubElement(root, "a")
            a.set("href", link)
            a.set("target", "_blank")
            a.set("title", link)
            img = markdown.util.etree.SubElement(a, "img")
            img.set("src", url)
            img.set("class", "message_inline_image")

        return root
|
|
|
|
class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
    """Tree processor that appends rich previews for "interesting" links.

    Currently the only supported service is Twitter: links to a tweet get a
    ``<div class="inline-preview-twitter">`` appended to the root, containing
    the author's avatar, the tweet text and the author's name.
    """

    # Seconds to wait for the Twitter API.  Without a bound, a slow response
    # would hold up rendering of the whole message, not just the preview.
    TWITTER_TIMEOUT = 3

    def twitter_link(self, url):
        """Build a preview for a link to a tweet.

        Returns ``('twitter', element)`` where ``element`` is the preview
        ``<div>``, or None when ``url`` is not a tweet link or the tweet
        could not be fetched.
        """
        parsed_url = urlparse.urlparse(url)
        if not (parsed_url.netloc == 'twitter.com' or parsed_url.netloc.endswith('.twitter.com')):
            return None

        tweet_id_match = re.match(r'^/.*?/status/(\d{18})$', parsed_url.path)
        if not tweet_id_match:
            return None

        tweet_id = tweet_id_match.group(1)
        try:
            if settings.TEST_SUITE:
                import testing_mocks
                res = testing_mocks.twitter(tweet_id)
            else:
                response = urllib2.urlopen(
                    "https://api.twitter.com/1/statuses/show.json?id=%s" % (tweet_id,),
                    timeout=self.TWITTER_TIMEOUT)
                try:
                    res = simplejson.load(response)
                finally:
                    # Release the connection even if the body is not valid JSON.
                    response.close()

            user = res['user']
            tweet = markdown.util.etree.Element("div")
            tweet.set("class", "twitter-tweet")
            img_a = markdown.util.etree.SubElement(tweet, 'a')
            img_a.set("href", url)
            img_a.set("target", "_blank")
            profile_img = markdown.util.etree.SubElement(img_a, 'img')
            profile_img.set('class', 'twitter-avatar')
            profile_img.set('src', user['profile_image_url_https'])
            p = markdown.util.etree.SubElement(tweet, 'p')
            p.text = res['text']
            span = markdown.util.etree.SubElement(tweet, 'span')
            span.text = "- %s (@%s)" % (user['name'], user['screen_name'])

            return ('twitter', tweet)
        except Exception:
            # We put this in its own try-except because it requires external
            # connectivity. If Twitter flakes out, we don't want to not-render
            # the entire message; we just want to not show the Twitter preview.
            #
            # Catch Exception rather than everything so that KeyboardInterrupt
            # and SystemExit still propagate.
            traceback.print_exc()
            return None

    # Search the tree for <a> tags and read their href values
    def find_interesting_links(self, root):
        """Return the previews produced for the ``<a>`` elements under ``root``."""
        def process_interesting_links(element):
            if element.tag != "a":
                return None

            url = element.get("href")
            if not url:
                # Anchors without an href cannot be parsed as a URL.
                return None
            return self.twitter_link(url)

        return walk_tree(root, process_interesting_links)

    def run(self, root):
        """Append one ``inline-preview-<service>`` div to ``root`` per preview."""
        interesting_links = self.find_interesting_links(root)
        for (service_name, data) in interesting_links:
            div = markdown.util.etree.SubElement(root, "div")
            div.set("class", "inline-preview-%s" % service_name)
            div.insert(0, data)
        return root
|
|
|
|
class Gravatar(markdown.inlinepatterns.Pattern):
    """Inline pattern that replaces its match with a Gravatar ``<img>``.

    The pattern is expected to provide a named group ``email``.
    """

    def handleMatch(self, match):
        """Return an ``<img>`` element pointing at the matched email's avatar."""
        # gravatar_hash() supplies the identifier used in the avatar URL.
        email_hash = gravatar_hash(match.group('email'))
        avatar_url = ('https://secure.gravatar.com/avatar/%s?d=identicon&s=30'
                      % (email_hash,))

        avatar = markdown.util.etree.Element('img')
        avatar.set('class', 'message_body_gravatar img-rounded')
        avatar.set('src', avatar_url)
        return avatar
|
|
|
|
# Glob pattern matching the bundled gemoji images.  The two '..' components
# climb out of this file's directory; the result should be zephyr/.
path_to_emoji = os.path.join(
    os.path.dirname(__file__), '..', '..',
    'static', 'third', 'gemoji', 'images', 'emoji', '*.png')

# Names of the available emoji: each image's file name minus its extension.
emoji_list = []
for fn in glob.glob(path_to_emoji):
    emoji_list.append(os.path.splitext(os.path.basename(fn))[0])
|
|
|
|
def make_emoji(emoji_name, display_string):
    """Return an ``<img class="emoji">`` element for ``emoji_name``.

    ``display_string`` is used for both the ``alt`` and ``title`` attributes.
    """
    elt = markdown.util.etree.Element('img')
    attributes = (
        ('src', 'static/third/gemoji/images/emoji/%s.png' % (emoji_name,)),
        ('class', 'emoji'),
        ('alt', display_string),
        ('title', display_string),
    )
    for attr_name, attr_value in attributes:
        elt.set(attr_name, attr_value)
    return elt
|
|
|
|
class Emoji(markdown.inlinepatterns.Pattern):
    """Inline pattern that turns known ``:name:`` syntax into an emoji image.

    The pattern is expected to provide a named group ``syntax`` holding the
    full text including both delimiters.
    """

    def handleMatch(self, match):
        """Return the emoji element, or the original text for unknown names."""
        syntax = match.group("syntax")
        # Drop the first and last character (the delimiters) to get the name.
        emoji_name = syntax[1:-1]
        if emoji_name in emoji_list:
            return make_emoji(emoji_name, syntax)
        return syntax
|
|
|
|
def fixup_link(link):
    """Apply the attributes every link should carry.

    The link is made to open in a new window (``target="_blank"``) and its
    ``title`` is set to the value of its ``href``.  The element is modified
    in place; nothing is returned.
    """
    destination = link.get('href')
    for attr_name, attr_value in (('target', '_blank'), ('title', destination)):
        link.set(attr_name, attr_value)
|
|
|
|
|
|
def sanitize_url(url):
    """
    Sanitize a url against xss attacks.
    See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.

    Returns the normalized url, or None if the url is rejected.  Rejection
    is always signalled with None (never an empty string), because callers
    only test ``is None``.
    """
    try:
        parts = urlparse.urlparse(url.replace(' ', '%20'))
    except ValueError:
        # Bad url - so bad it couldn't be parsed.
        return None

    scheme = parts[0]
    netloc = parts[1]

    # Humbug modification: If scheme is not specified, assume http://
    # It's unlikely that users want relative links within humbughq.com.
    # We re-enter sanitize_url because netloc etc. need to be re-parsed.
    if not scheme:
        return sanitize_url('http://' + url)

    # Script-bearing schemes are never acceptable, even when written with a
    # "//host/" part that would otherwise satisfy the netloc check below.
    if scheme in ('javascript', 'vbscript', 'data'):
        return None

    # Schemes that legitimately have no network location.  (An empty scheme
    # cannot reach this point; it was handled above.)
    locless_schemes = ['mailto', 'news']
    if netloc == '' and scheme not in locless_schemes:
        # This fails regardless of anything else.
        # Return immediately to save additional processing
        return None

    # path, params, query and fragment must not contain a colon.
    for part in parts[2:]:
        if ":" in part:
            # Not a safe url
            return None

    # Url passes all tests. Return url as-is.
    return urlparse.urlunparse(parts)
|
|
|
|
def url_to_a(url):
    """Turn a bare url or e-mail address into an ``<a>`` element.

    Returns the element, with ``url`` as its text, or the original ``url``
    string unchanged when the url is rejected by sanitize_url (so that it is
    rendered as plain text).
    """
    # Only a string without its own scheme can be a bare e-mail address.
    # URLs such as https://example.com/@user contain '@' too, and prefixing
    # them with mailto: would make sanitize_url reject them.
    if '@' in url and '://' not in url:
        href = 'mailto:' + url
    else:
        href = url

    href = sanitize_url(href)
    if not href:
        # Rejected by sanitize_url; render it as plain text.
        return url

    # Build the element only once the url is known to be acceptable.
    a = markdown.util.etree.Element('a')
    a.set('href', href)
    a.text = url
    fixup_link(a)
    return a
|
|
|
|
class AutoLink(markdown.inlinepatterns.Pattern):
    """Inline pattern that linkifies matched urls lacking an http(s) scheme.

    The pattern is expected to provide a named group ``url``.
    """

    def handleMatch(self, match):
        """Return a link element, or the plain text for http:/https: urls."""
        url = match.group('url')
        # As this will also match already-matched https?:// links,
        # don't doubly-link them
        if url.startswith(('http:', 'https:')):
            return url
        return url_to_a(url)
|
|
|
|
class HttpLink(markdown.inlinepatterns.Pattern):
    """Inline pattern that linkifies whatever its ``url`` group matched."""

    def handleMatch(self, match):
        """Return the result of url_to_a() for the matched url."""
        return url_to_a(match.group('url'))
|
|
|
|
class UListProcessor(markdown.blockprocessors.OListProcessor):
    """Block processor for unordered lists that only accepts '*' bullets.

    Modelled on markdown.blockprocessors.UListProcessor; unlike that class,
    '+' and '-' are not recognised as bullet characters.
    """

    # A list item: up to three leading spaces, a '*', at least one space,
    # then the item text.
    RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)')
    TAG = 'ul'
|
|
|
|
class BugdownUListPreprocessor(markdown.preprocessors.Preprocessor):
    """ Lets a bulleted list start on the line right after a paragraph.

        Whenever a list item directly follows a non-empty line that is not
        itself a list item, a blank line is inserted between the two so that
        Markdown recognises the list. """

    LI_RE = re.compile(r'^[ ]{0,3}[*][ ]+(.*)', re.MULTILINE)
    HANGING_ULIST_RE = re.compile(r'^.+\n([ ]{0,3}[*][ ]+.*)', re.MULTILINE)

    def run(self, lines):
        """ Return a copy of lines with the missing blank lines inserted """
        output = []
        open_fence = None
        last_index = len(lines) - 1

        for index, line in enumerate(lines):
            output.append(line)
            if index == last_index:
                # Nothing follows the final line, so nothing can hang off it.
                break

            # Track fenced code blocks: a fence opens one, and the same
            # fence string closes it again.
            fence_match = FENCE_RE.match(line)
            if fence_match:
                marker = fence_match.group('fence')
                if not open_fence:
                    open_fence = marker
                elif open_fence == marker:
                    open_fence = None

            # Inside a fenced block, or after an empty line, leave things be.
            if open_fence or not line:
                continue

            # A list item hanging off a non-list line gets a separator.
            if self.LI_RE.match(lines[index + 1]) and not self.LI_RE.match(line):
                output.append('')

        return output
|
|
|
|
# Based on markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
    """ Build a link element from explicit Markdown link syntax. """

    def handleMatch(self, m):
        """ Return an <a> element, or the matched text if the link is unusable. """
        # The untouched source text, handed back whenever a check fails.
        orig_syntax = m.group(0)

        target = m.group(9)
        if not target:
            return orig_syntax

        # A target starting with "<" loses its first and last character.
        if target.startswith("<"):
            target = target[1:-1]

        href = sanitize_url(self.unescape(target.strip()))
        if href is None:
            return orig_syntax

        el = markdown.util.etree.Element('a')
        el.text = m.group(2)
        el.set('href', href)
        fixup_link(el)
        return el
|
|
|
|
class Bugdown(markdown.Extension):
    """Markdown extension that installs the Humbug-specific syntax rules."""

    def extendMarkdown(self, md, md_globals):
        """Remove unwanted stock rules from ``md`` and register our own.

        The order of the ``add`` calls matters: several rules are positioned
        relative to a rule registered just before them.
        """
        del md.preprocessors['reference']

        for k in ('image_link', 'image_reference', 'automail',
                  'autolink', 'link', 'reference', 'short_reference',
                  'escape', 'strong_em', 'emphasis', 'emphasis2',
                  'strong'):
            del md.inlinePatterns[k]

        # Custom bold syntax: **foo** but not __foo__
        md.inlinePatterns.add('strong',
            markdown.inlinepatterns.SimpleTagPattern(r'(\*\*)([^\n]+?)\2', 'strong'),
            '>not_strong')

        for k in ('hashheader', 'setextheader', 'olist', 'ulist'):
            del md.parser.blockprocessors[k]

        md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')

        md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')
        md.inlinePatterns.add('emoji', Emoji(r'(?<!\S)(?P<syntax>:[^:\s]+:)(?!\S)'), '_begin')
        md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>backtick')

        # markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which
        # is important because we're using \w.
        #
        # This rule must come after the built-in 'link' markdown linkifier to
        # avoid errors.
        http_link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'
        md.inlinePatterns.add('http_autolink', HttpLink(http_link_regex), '>link')

        # A link starts at a word boundary, and ends at space, punctuation, or end-of-input.
        #
        # We detect a url by checking for the TLD, and building around it.
        #
        # To support () in urls but not match ending ) when a url is inside a parenthesis,
        # we match at maximum one set of matching parens in a url. We could extend this
        # to match two parenthetical groups, at the cost of more regex complexity.
        #
        # This rule must come after the http_autolink rule we add above to avoid double
        # linkifying.
        #
        # The TLDs are listed without a leading dot (the regex supplies the
        # literal dot itself) and are escaped so that the '.' in 'co.uk' only
        # matches a real dot.
        tld_names = ['co.uk', 'com', 'co', 'biz', 'gd', 'org', 'net', 'ly', 'edu', 'mil',
                     'gov', 'info', 'me', 'it', 'ca', 'tv', 'fm', 'io', 'gl']
        tlds = '|'.join([re.escape(tld) for tld in tld_names])
        link_regex = r"\b(?P<url>[^\s]+\.(%s)(?:/[^\s()\":]*?|([^\s()\":]*\([^\s()\":]*\)[^\s()\":]*))?)(?=([:;\?\),\.\'\"]\Z|[:;\?\),\.\'\"]\s|\Z|\s))" % (tlds,)
        md.inlinePatterns.add('autolink', AutoLink(link_regex), '>http_autolink')

        md.preprocessors.add('hanging_ulists',
                             BugdownUListPreprocessor(md),
                             "_begin")

        md.treeprocessors.add("inline_images", InlineImagePreviewProcessor(md), "_end")
        md.treeprocessors.add("inline_interesting_links", InlineInterestingLinkProcessor(md), "_end")
|
|
|
|
# Extensions for the shared renderer, in the order they are registered.
_md_extensions = [
    'nl2br',
    codehilite.makeExtension(configs=[
        ('force_linenos', False),
        ('guess_lang', False),
    ]),
    fenced_code.makeExtension(),
    Bugdown(),
]

# The single Markdown instance used by convert(): HTML output, with safe_mode
# set to 'escape'.
_md_engine = markdown.Markdown(
    safe_mode='escape',
    output_format='html',
    extensions=_md_extensions)
|
|
|
|
# Markdown parser failures should be logged, but the message that caused them
# must not be, for privacy reasons.  As a compromise every word character is
# replaced with 'x', which keeps the shape of the input without its content.
#
# The result is passed through repr() to improve reproducibility, and to
# escape terminal control codes, which can do surprisingly nasty things.
_privacy_re = re.compile(r'\w', flags=re.UNICODE)


def _sanitize_for_log(md):
    """Return the repr() of ``md`` with every word character replaced by 'x'."""
    masked = _privacy_re.sub('x', md)
    return repr(masked)
|
|
|
|
def convert(md):
    """Convert Markdown to HTML, with Humbug-specific settings and hacks.

    Returns the rendered HTML.  If rendering raises (including when it does
    not finish within the time limit), a fixed apology paragraph is returned
    instead, and the failure is logged, posted to the "devel" stream and
    mailed to the admins with a sanitized copy of the input.
    """

    # Reset the parser; otherwise it will get slower over time.
    _md_engine.reset()

    try:
        # Spend at most 5 seconds rendering.
        # Sometimes Python-Markdown is really slow; see
        # https://trac.humbughq.com/ticket/345
        html = timeout(5, _md_engine.convert, md)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not reported as parser failures.
        from zephyr.models import Recipient
        from zephyr.lib.actions import internal_send_message

        # Format the traceback once, before doing anything else, and reuse
        # it for both the log entry and the e-mail.
        error_trace = traceback.format_exc()
        cleaned = _sanitize_for_log(md)

        html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'

        # Output error to log as well as sending a humbug and email
        logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'
                                    % (error_trace, cleaned))
        subject = "Markdown parser failure"
        internal_send_message("humbug+errors@humbughq.com",
                              Recipient.STREAM, "devel", subject,
                              "Markdown parser failed, message sent to devel@")
        mail.mail_admins(subject, "Failed message: %s\n\n%s\n\n" % (
                             cleaned, error_trace),
                         fail_silently=False)

    return html
|