Files
zulip/zephyr/lib/bugdown/__init__.py
Keegan McAllister 6d9e0095eb bugdown: Disable reference-based links
This is syntax like

    Here's [a link][]

    [a link]: http://google.com

This is not very useful for short chat-style messages.  It will confuse users,
especially because we don't document it.  And disabling it saves the effort of
applying the same link fixups as elsewhere.

(imported from commit c23391465486db545302b79c084b4f9cd5cdcc6a)
2012-12-05 13:54:43 -05:00

156 lines
5.4 KiB
Python

import markdown
import logging
import traceback
import urlparse
import re
from zephyr.lib.avatar import gravatar_hash
from zephyr.lib.bugdown import codehilite, fenced_code
class Gravatar(markdown.inlinepatterns.Pattern):
def handleMatch(self, match):
img = markdown.util.etree.Element('img')
img.set('class', 'message_body_gravatar img-rounded')
img.set('src', 'https://secure.gravatar.com/avatar/%s?d=identicon&s=30'
% (gravatar_hash(match.group('email')),))
return img
def fixup_link(link):
"""Set certain attributes we want on every link."""
link.set('target', '_blank')
link.set('title', link.get('href'))
class AutoLink(markdown.inlinepatterns.Pattern):
def handleMatch(self, match):
url = match.group('url')
a = markdown.util.etree.Element('a')
a.set('href', url)
a.text = url
fixup_link(a)
return a
class UListProcessor(markdown.blockprocessors.OListProcessor):
""" Process unordered list blocks.
Based on markdown.blockprocessors.UListProcessor, but does not accept
'+' as a bullet character."""
TAG = 'ul'
RE = re.compile(r'^[ ]{0,3}[*-][ ]+(.*)')
# Based on markdown.inlinepatterns.LinkPattern
class LinkPattern(markdown.inlinepatterns.Pattern):
""" Return a link element from the given match. """
def handleMatch(self, m):
el = markdown.util.etree.Element("a")
el.text = m.group(2)
href = m.group(9)
if href:
if href[0] == "<":
href = href[1:-1]
el.set("href", self.sanitize_url(self.unescape(href.strip())))
else:
el.set("href", "")
fixup_link(el)
return el
def sanitize_url(self, url):
"""
Sanitize a url against xss attacks.
See the docstring on markdown.inlinepatterns.LinkPattern.sanitize_url.
"""
try:
parts = urlparse.urlparse(url.replace(' ', '%20'))
scheme, netloc, path, params, query, fragment = parts
except ValueError:
# Bad url - so bad it couldn't be parsed.
return ''
# Humbug modification: If scheme is not specified, assume http://
# It's unlikely that users want relative links within humbughq.com.
# We re-enter sanitize_url because netloc etc. need to be re-parsed.
if not scheme:
return self.sanitize_url('http://' + url)
locless_schemes = ['', 'mailto', 'news']
if netloc == '' and scheme not in locless_schemes:
# This fails regardless of anything else.
# Return immediately to save additional proccessing
return ''
for part in parts[2:]:
if ":" in part:
# Not a safe url
return ''
# Url passes all tests. Return url as-is.
return urlparse.urlunparse(parts)
class Bugdown(markdown.Extension):
def extendMarkdown(self, md, md_globals):
del md.preprocessors['reference']
for k in ('image_link', 'image_reference', 'automail',
'autolink', 'link', 'reference', 'short_reference'):
del md.inlinePatterns[k]
for k in ('hashheader', 'setextheader', 'olist', 'ulist'):
del md.parser.blockprocessors[k]
md.parser.blockprocessors.add('ulist', UListProcessor(md.parser), '>hr')
md.inlinePatterns.add('gravatar', Gravatar(r'!gravatar\((?P<email>[^)]*)\)'), '_begin')
md.inlinePatterns.add('link', LinkPattern(markdown.inlinepatterns.LINK_RE, md), '>escape')
# A link starts at a word boundary, and ends at space or end-of-input.
# But any trailing punctuation (other than /) is not included.
# We accomplish this with a non-greedy match followed by a greedy
# lookahead assertion.
#
# markdown.inlinepatterns.Pattern compiles this with re.UNICODE, which
# is important because we're using \w.
link_regex = r'\b(?P<url>https?://[^\s]+?)(?=[^\w/]*(\s|\Z))'
md.inlinePatterns.add('autolink', AutoLink(link_regex), '>link')
_md_engine = markdown.Markdown(
safe_mode = 'escape',
output_format = 'html',
extensions = ['nl2br',
codehilite.makeExtension(configs=[
('force_linenos', False),
('guess_lang', False)]),
fenced_code.makeExtension(),
Bugdown()])
# We want to log Markdown parser failures, but shouldn't log the actual input
# message for privacy reasons. The compromise is to replace all alphanumeric
# characters with 'x'.
#
# We also use repr() to improve reproducibility, and to escape terminal control
# codes, which can do surprisingly nasty things.
_privacy_re = re.compile(r'\w', flags=re.UNICODE)
def _sanitize_for_log(md):
return repr(_privacy_re.sub('x', md))
def _linkify(match):
url = match.group('url')
return ' [%s](%s) ' % (url, url)
def convert(md):
"""Convert Markdown to HTML, with Humbug-specific settings and hacks."""
# Reset the parser; otherwise it will get slower over time.
_md_engine.reset()
try:
html = _md_engine.convert(md)
except:
# FIXME: Do something more reasonable here!
html = '<p>[Humbug note: Sorry, we could not understand the formatting of your message]</p>'
logging.getLogger('').error('Exception in Markdown parser: %sInput (sanitized) was: %s'
% (traceback.format_exc(), _sanitize_for_log(md)))
return html