Show More
Commit Description:
Merge pull request #17 from nattee/master...
Commit Description:
Merge pull request #17 from nattee/master
upgrade to current working snapshot
References:
File last commit:
Show/Diff file:
Action:
lib/assets/Lib/html/parser.py
| 532 lines
| 19.7 KiB
| text/x-python
| PythonLexer
|
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
import _markupbase | ||||
import re | ||||
import warnings | ||||
# Regular expressions used for parsing.
#
# All patterns that contain regex escapes (\s etc.) are written as raw
# strings so that Python never tries to interpret them as string escapes.

# Start of the next construct of interest in normal (non-CDATA) mode.
interesting_normal = re.compile('[&<]')
# An '&' that could still grow into an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named and numeric references; both patterns also consume the one
# character that terminates the reference (';' or anything else).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# (deliberately NOT a raw string: \t, \n, \r, \f and \x00 are meant to be
# the actual control characters inside the character class)
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description (must be truthy).
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (zero-based column), or None
            when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair it refers to."""
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever location info is known."""
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # the stored offset is zero-based; report a one-based column
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # After the start tag of one of these elements the parser switches to
    # CDATA mode (see set_cdata_mode), where only the matching end tag is
    # treated as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None until one has
    # been parsed, or while the current one is still incomplete).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '<' and '&' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker.  Whatever was not consumed stays in self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        startswith = rawdata.startswith
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # in CDATA mode keep buffering until the end tag shows up
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a lone '<' followed by something that is not markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    # '<' is the last buffered character: wait for more
                    break
                if k < 0:
                    # the construct is not terminated yet
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # at EOF: emit the unterminated construct as plain data,
                    # up to and including the next '>' or up to the next '<'
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern swallows one terminating character; give
                    # it back unless it was the ';'
                    if not startswith(';', k - 1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # emit the two characters at the current position
                        # (not the first two of the buffer) and step past them
                        self.handle_data(rawdata[i:i + 2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k - 1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # explicit raise so the guard survives python -O
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Internal -- parse html declarations.

        Return the index just past the declaration, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Internal -- parse bogus comment.

        Everything between the two-character opener ('<!' or '</') and the
        next '>' is passed to handle_comment() when *report* is true.
        Return the index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    def parse_pi(self, i):
        """Internal -- parse processing instr, return end or -1 if not terminated."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # hand over the text between '<?' and the closing '>'
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    def parse_starttag(self, i):
        """Internal -- handle starttag, return end or -1 if not terminated."""
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without '=value'
                attrvalue = None
            elif (attrvalue[:1] in ('\'', '"')
                  and attrvalue[:1] == attrvalue[-1:]):
                # strip the matching pair of quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        tail = rawdata[k:endpos].strip()
        if tail not in (">", "/>"):
            # something other than the tag closer is left over
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if tail.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def check_for_whole_start_tag(self, i):
        """Internal -- check to see if we have a complete starttag.

        Return the index just past the tag, or -1 if it is incomplete.
        For malformed input in non-strict mode the returned index is where
        the recognisable part stops (always greater than *i*).
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        nextchar = rawdata[j:j+1]
        if nextchar == ">":
            return j + 1
        if nextchar == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # a '/' not (yet) followed by '>': buffer boundary
            return -1
        if nextchar == "":
            # end of input
            return -1
        if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        if self.strict:
            self.updatepos(i, j)
            self.error("malformed start tag")
        # always make progress, even if nothing was matched
        return j if j > i else i + 1

    def parse_endtag(self, i):
        """Internal -- parse endtag, return end or -1 if incomplete."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside script/style a malformed end tag is just text
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # end tag of some other element inside CDATA content
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem.lower())
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Report an empty-element tag as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lowercased tag name and a list of (name, value)."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lowercased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name (without '&' and ';')."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text content."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text of a comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the text of a declaration (without '<!' and '>')."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration; an error only in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    def unescape(self, s):
        """Internal -- helper to remove special character quoting.

        Return *s* with character and entity references replaced by the
        characters they stand for.  References that cannot be resolved
        (unknown names, malformed or out-of-range numbers) are left in the
        text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        code = int(digits[1:].rstrip(';'), 16)
                    else:
                        code = int(digits.rstrip(';'))
                    return chr(code)
                except (ValueError, OverflowError):
                    # not a number, or a code point chr() cannot represent
                    # (chr raises OverflowError for very large integers)
                    return '&#' + digits
            from html.entities import html5
            if ref in html5:
                return html5[ref]
            if ref.endswith(';'):
                return '&' + ref
            # no exact match: try the shortest known prefix (>= 2 chars)
            # and keep the remaining characters as text
            for x in range(2, len(ref)):
                if ref[:x] in html5:
                    return html5[ref[:x]] + ref[x:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replaceEntities, s, flags=re.ASCII)