|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
try:
|
|
|
import _jsre as re
|
|
|
except:
|
|
|
import re
|
|
|
|
|
|
import random
|
|
|
import time
|
|
|
|
|
|
# Alphabet (a-z, A-Z, 0-9) from which apply_markdown() draws the random
# placeholder string it uses to park HTML tags.
letters = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
)
|
|
|
|
|
|
class URL:
    """Target of a link reference definition: an address plus optional title.

    ``src`` is the text following ``[label]:`` in the source.  The first
    whitespace-separated token becomes ``href``; whatever follows becomes
    ``alt`` when it is wrapped in double quotes, single quotes or
    parentheses (the wrapping characters are removed).  In every other
    case ``alt`` is the empty string.
    """

    # (opening, closing) delimiters accepted around the title
    _DELIMITERS = (('"', '"'), ("'", "'"), ('(', ')'))

    def __init__(self, src):
        elts = src.split(maxsplit=1)
        # An empty or whitespace-only target yields no token at all; fall
        # back to an empty href instead of raising IndexError.
        self.href = elts[0] if elts else ''
        self.alt = ''
        if len(elts) == 2:
            alt = elts[1]
            for opening, closing in self._DELIMITERS:
                if alt[0] == opening and alt[-1] == closing:
                    self.alt = alt[1:-1]
                    break
|
|
|
|
|
|
class CodeBlock:
    """A block of literal code, either indented or fenced with ```.

    ``lines`` holds the raw lines of the block, starting with the line
    passed to the constructor.  For a fenced block that first line is the
    opening fence itself; any text after the three backticks is kept in
    ``info`` and used as the CSS class of the generated ``<pre>``.
    """

    def __init__(self, line):
        self.lines = [line]
        if line.startswith("```") and len(line) > 3:
            self.info = line[3:]
        else:
            self.info = None

    def to_html(self):
        """Return ``(html, scripts)`` for this block; ``scripts`` is empty.

        The stored lines are left untouched, so the method can be called
        repeatedly with the same result.
        """
        lines = self.lines
        # Drop the opening fence only.  Testing for a single backtick would
        # also discard the first line of an indented block that merely
        # starts with inline code.
        if lines and lines[0].startswith("```"):
            lines = lines[1:]
        res = unmark(escape('\n'.join(lines)))
        # the info string ends up inside a double-quoted attribute
        _class = (self.info or "marked").replace('"', '&quot;')
        res = '<pre class="%s">%s</pre>\n' % (_class, res)
        return res, []
|
|
|
|
|
|
class HtmlBlock:
    """Section made of raw HTML that is handed back exactly as received."""

    def __init__(self, src):
        # markup is stored verbatim
        self.src = src

    def to_html(self):
        """Return the stored markup unchanged.

        NOTE(review): this returns a bare string, whereas
        CodeBlock.to_html returns an ``(html, scripts)`` pair; confirm
        which shape callers expect before mixing the two.
        """
        markup = self.src
        return markup
|
|
|
|
|
|
class Marked:
    """Section of ordinary text on which inline Markdown is applied."""

    def __init__(self, line=''):
        # no code in this file reads ``children``; it is kept as an attribute
        self.children = []
        # accumulated source text of the section
        self.line = line

    def to_html(self):
        """Return whatever ``apply_markdown`` produces for the section text."""
        text = self.line
        return apply_markdown(text)
|
|
|
|
|
|
# Link reference definitions found by mark(), keyed by lower-cased label.
refs = dict()
# "[label]: target" at the start of a line; group 1 is the label, group 2
# everything after the colon and the whitespace that follows it.
ref_pattern = r"^\[(.*)\]:\s+(.*)"
|
|
|
|
|
|
def _heading_html(line):
    """Return the ``<Hn>`` markup for a line starting with ``#``.

    Returns None when the line is a lone ``#`` without a title, in which
    case the caller drops the line.  A longer run of hashes without a
    title is rendered as a heading one level up whose text is ``#``.
    """
    hashes = len(line) - len(line.lstrip('#'))
    rest = line[hashes:]
    # Skip the separator after the hashes, but do not eat the first
    # character of the title when the separator is missing ("#title").
    title = rest[1:] if rest[:1].isspace() else rest
    if title.strip():
        level = min(hashes, 6)  # HTML has no heading deeper than H6
        return '<H%s>%s</H%s>\n' % (level, title, level)
    if hashes == 1:
        return None
    level = min(hashes - 1, 6)
    return '<H%s>%s</H%s>\n' % (level, '#', level)


def _wrap_block_elements(lines):
    """Insert blockquote and list markup into *lines* and return the list.

    The list is modified in place: ``>`` prefixes are removed and
    ``<blockquote>`` lines inserted, bullet and numbered items are
    rewritten to start with ``<li>``, and ``<ul>`` / ``<ol>`` opening and
    closing lines are inserted around them.
    """
    ordered_item = r'^(\d+\.)'
    i = bq = 0
    ul = ol = 0

    while i < len(lines):

        # enclose lines starting with > in a blockquote
        if lines[i].startswith('>'):
            nb = 1
            while nb < len(lines[i]) and lines[i][nb] == '>':
                nb += 1
            lines[i] = lines[i][nb:]
            if nb > bq:
                lines.insert(i, '<blockquote>' * (nb - bq))
                i += 1
                bq = nb
            elif nb < bq:
                lines.insert(i, '</blockquote>' * (bq - nb))
                i += 1
                bq = nb
        elif bq > 0:
            lines.insert(i, '</blockquote>' * bq)
            i += 1
            bq = 0

        # unordered lists
        stripped = lines[i].lstrip()
        if stripped and stripped[0] in '-+*' \
                and len(stripped) > 1 and stripped[1] == ' ' \
                and (i == 0 or ul or not lines[i - 1].strip()):
            # line indentation indicates nesting level
            nb = 1 + len(lines[i]) - len(stripped)
            lines[i] = '<li>' + lines[i][nb:]
            if nb > ul:
                lines.insert(i, '<ul>' * (nb - ul))
                i += 1
            elif nb < ul:
                lines.insert(i, '</ul>' * (ul - nb))
                i += 1
            ul = nb
        elif ul and not lines[i].strip():
            # a blank line ends the list unless the next line is indented
            # or is itself a list item
            if i < len(lines) - 1 and lines[i + 1].strip() \
                    and not lines[i + 1].startswith(' '):
                nline = lines[i + 1].lstrip()
                if not (nline[0] in '-+*' and len(nline) > 1
                        and nline[1] == ' '):
                    lines.insert(i, '</ul>' * ul)
                    i += 1
                    ul = 0

        # ordered lists
        mo = re.search(ordered_item, lines[i])
        if mo:
            if not ol:
                lines.insert(i, '<ol>')
                i += 1
            lines[i] = '<li>' + lines[i][len(mo.groups()[0]):]
            ol = 1
        elif ol and not lines[i].strip() and i < len(lines) - 1 \
                and not lines[i + 1].startswith(' ') \
                and not re.search(ordered_item, lines[i + 1]):
            lines.insert(i, '</ol>')
            i += 1
            ol = 0

        i += 1

    # close whatever is still open at the end of the document
    if ul:
        lines.append('</ul>' * ul)
    if ol:
        lines.append('</ol>' * ol)
    if bq:
        lines.append('</blockquote>' * bq)
    return lines


def mark(src):
    """Convert Markdown source *src* to HTML.

    Returns a ``(html, scripts)`` pair: ``html`` is the generated markup
    and ``scripts`` a list with the text found between ``<script ...>``
    and ``</script>`` lines, plus whatever the sections' ``to_html``
    methods report.

    The module-level ``refs`` dictionary is reset and then filled with the
    ``[label]: target`` definitions found in *src*.  Sections are rendered
    only after the whole source has been scanned, so a link may use a
    reference that is defined further down.
    """
    global refs
    refs = {}

    # normalise line feeds
    src = src.replace('\r\n', '\n')

    # a line followed by a line of = or - is a level 1 / level 2 heading
    src = re.sub(r'(.*?)\n=+\n', '\n# \\1\n', src)
    src = re.sub(r'(.*?)\n-+\n', '\n## \\1\n', src)

    lines = _wrap_block_elements(src.split('\n') + [''])

    # The source is then split in sections : code blocks, rendered
    # literally, and ordinary text, on which inline markup is applied.
    # Scripts are collected apart.
    code_indent = ' ' * 4  # must match the 4 characters stripped below
    sections = []
    scripts = []
    section = Marked()

    i = 0
    while i < len(lines):
        line = lines[i]

        if line.strip() and line.startswith(code_indent):
            # indented code block: runs while lines keep the indentation
            if section.line:
                sections.append(section)
            block = CodeBlock(line[4:])
            i += 1
            while i < len(lines) and lines[i].startswith(code_indent):
                block.lines.append(lines[i][4:])
                i += 1
            sections.append(block)
            section = Marked()
            continue

        if line.strip() and line.startswith("```"):
            # fenced code block, in the style of GitHub Flavoured Markdown
            if section.line:
                sections.append(section)
            block = CodeBlock(line)
            i += 1
            while i < len(lines) and not lines[i].startswith("```"):
                block.lines.append(lines[i])
                i += 1
            sections.append(block)
            section = Marked()
            i += 1  # skip the closing fence
            continue

        if line.lower().startswith('<script'):
            if section.line:
                sections.append(section)
                section = Marked()
            j = i + 1
            while j < len(lines):
                if lines[j].lower().startswith('</script>'):
                    scripts.append('\n'.join(lines[i + 1:j]))
                    # the closing line is then handled as a blank line
                    lines[j] = ''
                    break
                j += 1
            # NOTE: when no closing tag is found j == len(lines) and the
            # rest of the document is consumed without producing output.
            i = j
            continue

        if line.startswith('#'):
            # heading: becomes part of the text of the current section
            heading = _heading_html(line)
            if heading is not None:
                if section.line:
                    section.line += '\n'
                section.line += heading
            i += 1
            continue

        mo = re.search(ref_pattern, line)
        if mo is not None:
            # reference definition: stored, produces no output
            if section.line:
                sections.append(section)
                section = Marked()
            key, target = mo.groups()
            refs[key.lower()] = URL(target)
        else:
            if not line.strip():
                line = '<p></p>'
            if section.line:
                section.line += '\n'
            section.line += line
        i += 1

    if section.line:
        sections.append(section)

    parts = []
    for section in sections:
        mk, _scripts = section.to_html()
        parts.append(mk)
        scripts += _scripts
    return ''.join(parts), scripts
|
|
|
|
|
|
def escape(czone):
    """Return *czone* with HTML and Markdown special characters neutralised.

    ``&``, ``<`` and ``>`` become named HTML entities; ``_`` and ``*``
    become numeric character references so that the emphasis patterns
    applied later do not match them.
    """
    # '&' must be handled first, otherwise the ampersands introduced by
    # the other replacements would be escaped a second time
    czone = czone.replace('&', '&amp;')
    czone = czone.replace('<', '&lt;')
    czone = czone.replace('>', '&gt;')
    czone = czone.replace('_', '&#95;')
    czone = czone.replace('*', '&#42;')
    return czone
|
|
|
|
|
|
def s_escape(mo):
    """Replacement callback for ``re.sub``.

    Takes the match object *mo* and returns the whole matched text after
    passing it through ``escape``.
    """
    begin, end = mo.start(), mo.end()
    return escape(mo.string[begin:end])
|
|
|
|
|
|
def unmark(code_zone):
    """Return *code_zone* with every ``_`` replaced by ``&#95;``.

    Underscores written as a character reference are displayed the same
    way but are not picked up by the emphasis patterns.
    """
    return code_zone.replace('_', '&#95;')
|
|
|
|
|
|
def s_unmark(mo):
    """Replacement callback for ``re.sub``.

    Takes the match object *mo* and returns the whole matched text with
    every ``_`` replaced by ``&#95;``, so that underscores inside inline
    code are not picked up by the emphasis patterns.
    """
    code_zone = mo.string[mo.start():mo.end()]
    return code_zone.replace('_', '&#95;')
|
|
|
|
|
|
def _find_unescaped(text, char, start=0):
    """Return the index of the first *char* in *text*, at or after *start*,
    that is not preceded by a backslash; -1 if there is none."""
    pos = text.find(char, start)
    while pos > 0 and text[pos - 1] == '\\':
        pos = text.find(char, pos + 1)
    return pos


def _expand_links(src):
    """Replace ``[text](href)`` and ``[text][key]`` by ``<a>`` elements.

    For the second form the address is looked up in the module-level
    ``refs`` dictionary under the lower-cased key; an empty key (``[]``)
    means that the link text itself is the key.

    Raises KeyError when the key is not a known reference.
    """
    i = 0
    while i < len(src):
        if src[i] != '[':
            i += 1
            continue
        text_start = i + 1
        text_end = _find_unescaped(src, ']', text_start)
        # the link text must be closed on the same line
        if text_end == -1 or '\n' in src[text_start:text_end]:
            i += 1
            continue
        link = src[text_start:text_end]
        rest = src[text_end + 1:].lstrip()
        tag = None
        close = -1
        if rest.startswith('('):
            close = _find_unescaped(rest, ')')
            if close > -1 and '\n' not in rest[:close]:
                tag = '<a href="' + rest[1:close] + '">' + link + '</a>'
        elif rest.startswith('['):
            close = _find_unescaped(rest, ']')
            if close > -1 and '\n' not in rest[:close]:
                # the key is read for each link; it must not leak from one
                # link to the next
                key = rest[1:close] or link
                if key.lower() not in refs:
                    raise KeyError('unknown reference %s' % key)
                target = refs[key.lower()]
                # refs normally holds URL objects; a plain string is
                # accepted as well
                href = getattr(target, 'href', target)
                tag = '<a href="' + href + '">' + link + '</a>'
        if tag is None:
            i += 1
            continue
        src = src[:i] + tag + rest[close + 1:]
        # resume right after the inserted element, so that a link that
        # immediately follows is not skipped
        i += len(tag)
    return src


def _tag_end(src, start):
    """Return the index of the ``>`` closing the tag opened at *start*.

    ``>`` and line feeds inside a quoted attribute value are ignored.
    Returns -1 if a line feed outside quotes, or the end of *src*, is
    reached first.
    """
    quote = None  # local to each tag, so it cannot leak to the next one
    for j in range(start + 1, len(src)):
        char = src[j]
        if char == '"' or char == "'":
            if quote == char and src[j - 1] != '\\':
                quote = None
            elif quote is None:
                quote = char
        elif quote is None:
            if char == '>':
                return j
            if char == '\n':
                return -1
    return -1


def _isolate_tags(src, placeholder):
    """Replace every HTML tag in *src* by *placeholder*.

    Returns ``(new_src, tags)`` where ``tags`` lists the removed tags in
    order of appearance.  The content of inline code is skipped.  A ``<``
    that does not open a tag is the sign "less than" and is replaced by
    ``&lt;``.
    """
    tags = []
    i = 0
    while i < len(src):
        char = src[i]
        if char == '<':
            end = _tag_end(src, i)
            if end == -1:
                src = src[:i] + '&lt;' + src[i + 1:]
                i += 4  # length of '&lt;'
            else:
                tags.append(src[i:end + 1])
                src = src[:i] + placeholder + src[end + 1:]
                # resume right after the placeholder so that an adjacent
                # tag is not skipped
                i += len(placeholder)
        elif char == '`' and (i == 0 or src[i - 1] != '\\'):
            # ignore the content of inline code
            close = _find_unescaped(src, '`', i + 1)
            i = len(src) if close == -1 else close + 1
        else:
            i += 1
    return src, tags


def apply_markdown(src):
    """Apply inline Markdown (links, emphasis, inline code) to *src*.

    Returns ``(html, scripts)``: ``html`` is the converted text wrapped in
    ``<p>...</p>`` and ``scripts`` is always an empty list.

    Raises KeyError if a ``[text][key]`` link uses an unknown reference.
    """
    scripts = []

    src = _expand_links(src)

    # Before applying the markup with _ and *, isolate HTML tags because
    # they can contain these characters. They are replaced temporarily by
    # a random string, which must not occur in the text itself.
    rstr = ''.join(random.choice(letters) for _ in range(16))
    while rstr in src:
        rstr = ''.join(random.choice(letters) for _ in range(16))
    src, tags = _isolate_tags(src, rstr)

    # escape "<", ">", "&", "_" and "*" in inline code
    code_pattern = r'\`(.*?)\`'
    src = re.sub(code_pattern, s_escape, src)

    # replace escaped ` _ * by HTML character references
    src = src.replace(r'\\`', '&#96;')
    src = src.replace(r'\`', '&#96;')
    src = src.replace(r'\_', '&#95;')
    src = src.replace(r'\*', '&#42;')

    # emphasis : the double markers must be handled before the single ones
    strong_patterns = [('STRONG', r'\*\*(.*?)\*\*'), ('B', r'__(.*?)__')]
    for tag, strong_pattern in strong_patterns:
        src = re.sub(strong_pattern, r'<%s>\1</%s>' % (tag, tag), src)

    em_patterns = [('EM', r'\*(.*?)\*'), ('I', r'\_(.*?)\_')]
    for tag, em_pattern in em_patterns:
        src = re.sub(em_pattern, r'<%s>\1</%s>' % (tag, tag), src)

    # inline code
    src = re.sub(code_pattern, r'<code>\1</code>', src)

    # restore tags, from the last placeholder to the first
    while True:
        pos = src.rfind(rstr)
        if pos == -1:
            break
        repl = tags.pop()
        src = src[:pos] + repl + src[pos + len(rstr):]

    src = '<p>' + src + '</p>'
    return src, scripts
|
|
|
|