Show More
Commit Description:
Merge pull request #17 from nattee/master...
Commit Description:
Merge pull request #17 from nattee/master
upgrade to current working snapshot
References:
File last commit:
Show/Diff file:
Action:
lib/assets/Lib/_markupbase.py
| 395 lines
| 14.6 KiB
| text/x-python
| PythonLexer
|
r584 | """Shared support for scanning document type declarations in HTML and XHTML. | |||
This module is used as a foundation for the html.parser module. It has no | ||||
documented public API and should not be used directly. | ||||
""" | ||||
import re | ||||
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match | ||||
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match | ||||
_commentclose = re.compile(r'--\s*>') | ||||
_markedsectionclose = re.compile(r']\s*]\s*>') | ||||
# An analysis of the MS-Word extensions is available at | ||||
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf | ||||
_msmarkedsectionclose = re.compile(r']\s*>') | ||||
del re | ||||
class ParserBase: | ||||
"""Parser base class which provides some common support methods used | ||||
by the SGML/HTML and XHTML parsers.""" | ||||
def __init__(self): | ||||
if self.__class__ is ParserBase: | ||||
raise RuntimeError( | ||||
"_markupbase.ParserBase must be subclassed") | ||||
def error(self, message): | ||||
raise NotImplementedError( | ||||
"subclasses of ParserBase must override error()") | ||||
def reset(self): | ||||
self.lineno = 1 | ||||
self.offset = 0 | ||||
def getpos(self): | ||||
"""Return current line number and offset.""" | ||||
return self.lineno, self.offset | ||||
# Internal -- update line number and offset. This should be | ||||
# called for each piece of data exactly once, in order -- in other | ||||
# words the concatenation of all the input strings to this | ||||
# function should be exactly the entire input. | ||||
def updatepos(self, i, j): | ||||
if i >= j: | ||||
return j | ||||
rawdata = self.rawdata | ||||
nlines = rawdata.count("\n", i, j) | ||||
if nlines: | ||||
self.lineno = self.lineno + nlines | ||||
pos = rawdata.rindex("\n", i, j) # Should not fail | ||||
self.offset = j-(pos+1) | ||||
else: | ||||
self.offset = self.offset + j-i | ||||
return j | ||||
_decl_otherchars = '' | ||||
# Internal -- parse declaration (for use by subclasses). | ||||
def parse_declaration(self, i): | ||||
# This is some sort of declaration; in "HTML as | ||||
# deployed," this should only be the document type | ||||
# declaration ("<!DOCTYPE html...>"). | ||||
# ISO 8879:1986, however, has more complex | ||||
# declaration syntax for elements in <!...>, including: | ||||
# --comment-- | ||||
# [marked section] | ||||
# name in the following list: ENTITY, DOCTYPE, ELEMENT, | ||||
# ATTLIST, NOTATION, SHORTREF, USEMAP, | ||||
# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM | ||||
rawdata = self.rawdata | ||||
j = i + 2 | ||||
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" | ||||
if rawdata[j:j+1] == ">": | ||||
# the empty comment <!> | ||||
return j + 1 | ||||
if rawdata[j:j+1] in ("-", ""): | ||||
# Start of comment followed by buffer boundary, | ||||
# or just a buffer boundary. | ||||
return -1 | ||||
# A simple, practical version could look like: ((name|stringlit) S*) + '>' | ||||
n = len(rawdata) | ||||
if rawdata[j:j+2] == '--': #comment | ||||
# Locate --.*-- as the body of the comment | ||||
return self.parse_comment(i) | ||||
elif rawdata[j] == '[': #marked section | ||||
# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section | ||||
# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA | ||||
# Note that this is extended by Microsoft Office "Save as Web" function | ||||
# to include [if...] and [endif]. | ||||
return self.parse_marked_section(i) | ||||
else: #all other declaration elements | ||||
decltype, j = self._scan_name(j, i) | ||||
if j < 0: | ||||
return j | ||||
if decltype == "doctype": | ||||
self._decl_otherchars = '' | ||||
while j < n: | ||||
c = rawdata[j] | ||||
if c == ">": | ||||
# end of declaration syntax | ||||
data = rawdata[i+2:j] | ||||
if decltype == "doctype": | ||||
self.handle_decl(data) | ||||
else: | ||||
# According to the HTML5 specs sections "8.2.4.44 Bogus | ||||
# comment state" and "8.2.4.45 Markup declaration open | ||||
# state", a comment token should be emitted. | ||||
# Calling unknown_decl provides more flexibility though. | ||||
self.unknown_decl(data) | ||||
return j + 1 | ||||
if c in "\"'": | ||||
m = _declstringlit_match(rawdata, j) | ||||
if not m: | ||||
return -1 # incomplete | ||||
j = m.end() | ||||
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": | ||||
name, j = self._scan_name(j, i) | ||||
elif c in self._decl_otherchars: | ||||
j = j + 1 | ||||
elif c == "[": | ||||
# this could be handled in a separate doctype parser | ||||
if decltype == "doctype": | ||||
j = self._parse_doctype_subset(j + 1, i) | ||||
elif decltype in {"attlist", "linktype", "link", "element"}: | ||||
# must tolerate []'d groups in a content model in an element declaration | ||||
# also in data attribute specifications of attlist declaration | ||||
# also link type declaration subsets in linktype declarations | ||||
# also link attribute specification lists in link declarations | ||||
self.error("unsupported '[' char in %s declaration" % decltype) | ||||
else: | ||||
self.error("unexpected '[' char in declaration") | ||||
else: | ||||
self.error( | ||||
"unexpected %r char in declaration" % rawdata[j]) | ||||
if j < 0: | ||||
return j | ||||
return -1 # incomplete | ||||
# Internal -- parse a marked section | ||||
# Override this to handle MS-word extension syntax <![if word]>content<![endif]> | ||||
def parse_marked_section(self, i, report=1): | ||||
rawdata= self.rawdata | ||||
assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()" | ||||
sectName, j = self._scan_name( i+3, i ) | ||||
if j < 0: | ||||
return j | ||||
if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}: | ||||
# look for standard ]]> ending | ||||
match= _markedsectionclose.search(rawdata, i+3) | ||||
elif sectName in {"if", "else", "endif"}: | ||||
# look for MS Office ]> ending | ||||
match= _msmarkedsectionclose.search(rawdata, i+3) | ||||
else: | ||||
self.error('unknown status keyword %r in marked section' % rawdata[i+3:j]) | ||||
if not match: | ||||
return -1 | ||||
if report: | ||||
j = match.start(0) | ||||
self.unknown_decl(rawdata[i+3: j]) | ||||
return match.end(0) | ||||
# Internal -- parse comment, return length or -1 if not terminated | ||||
def parse_comment(self, i, report=1): | ||||
rawdata = self.rawdata | ||||
if rawdata[i:i+4] != '<!--': | ||||
self.error('unexpected call to parse_comment()') | ||||
match = _commentclose.search(rawdata, i+4) | ||||
if not match: | ||||
return -1 | ||||
if report: | ||||
j = match.start(0) | ||||
self.handle_comment(rawdata[i+4: j]) | ||||
return match.end(0) | ||||
# Internal -- scan past the internal subset in a <!DOCTYPE declaration, | ||||
# returning the index just past any whitespace following the trailing ']'. | ||||
def _parse_doctype_subset(self, i, declstartpos): | ||||
rawdata = self.rawdata | ||||
n = len(rawdata) | ||||
j = i | ||||
while j < n: | ||||
c = rawdata[j] | ||||
if c == "<": | ||||
s = rawdata[j:j+2] | ||||
if s == "<": | ||||
# end of buffer; incomplete | ||||
return -1 | ||||
if s != "<!": | ||||
self.updatepos(declstartpos, j + 1) | ||||
self.error("unexpected char in internal subset (in %r)" % s) | ||||
if (j + 2) == n: | ||||
# end of buffer; incomplete | ||||
return -1 | ||||
if (j + 4) > n: | ||||
# end of buffer; incomplete | ||||
return -1 | ||||
if rawdata[j:j+4] == "<!--": | ||||
j = self.parse_comment(j, report=0) | ||||
if j < 0: | ||||
return j | ||||
continue | ||||
name, j = self._scan_name(j + 2, declstartpos) | ||||
if j == -1: | ||||
return -1 | ||||
if name not in {"attlist", "element", "entity", "notation"}: | ||||
self.updatepos(declstartpos, j + 2) | ||||
self.error( | ||||
"unknown declaration %r in internal subset" % name) | ||||
# handle the individual names | ||||
meth = getattr(self, "_parse_doctype_" + name) | ||||
j = meth(j, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
elif c == "%": | ||||
# parameter entity reference | ||||
if (j + 1) == n: | ||||
# end of buffer; incomplete | ||||
return -1 | ||||
s, j = self._scan_name(j + 1, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
if rawdata[j] == ";": | ||||
j = j + 1 | ||||
elif c == "]": | ||||
j = j + 1 | ||||
while j < n and rawdata[j].isspace(): | ||||
j = j + 1 | ||||
if j < n: | ||||
if rawdata[j] == ">": | ||||
return j | ||||
self.updatepos(declstartpos, j) | ||||
self.error("unexpected char after internal subset") | ||||
else: | ||||
return -1 | ||||
elif c.isspace(): | ||||
j = j + 1 | ||||
else: | ||||
self.updatepos(declstartpos, j) | ||||
self.error("unexpected char %r in internal subset" % c) | ||||
# end of buffer reached | ||||
return -1 | ||||
# Internal -- scan past <!ELEMENT declarations | ||||
def _parse_doctype_element(self, i, declstartpos): | ||||
name, j = self._scan_name(i, declstartpos) | ||||
if j == -1: | ||||
return -1 | ||||
# style content model; just skip until '>' | ||||
rawdata = self.rawdata | ||||
if '>' in rawdata[j:]: | ||||
return rawdata.find(">", j) + 1 | ||||
return -1 | ||||
# Internal -- scan past <!ATTLIST declarations | ||||
def _parse_doctype_attlist(self, i, declstartpos): | ||||
rawdata = self.rawdata | ||||
name, j = self._scan_name(i, declstartpos) | ||||
c = rawdata[j:j+1] | ||||
if c == "": | ||||
return -1 | ||||
if c == ">": | ||||
return j + 1 | ||||
while 1: | ||||
# scan a series of attribute descriptions; simplified: | ||||
# name type [value] [#constraint] | ||||
name, j = self._scan_name(j, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
c = rawdata[j:j+1] | ||||
if c == "": | ||||
return -1 | ||||
if c == "(": | ||||
# an enumerated type; look for ')' | ||||
if ")" in rawdata[j:]: | ||||
j = rawdata.find(")", j) + 1 | ||||
else: | ||||
return -1 | ||||
while rawdata[j:j+1].isspace(): | ||||
j = j + 1 | ||||
if not rawdata[j:]: | ||||
# end of buffer, incomplete | ||||
return -1 | ||||
else: | ||||
name, j = self._scan_name(j, declstartpos) | ||||
c = rawdata[j:j+1] | ||||
if not c: | ||||
return -1 | ||||
if c in "'\"": | ||||
m = _declstringlit_match(rawdata, j) | ||||
if m: | ||||
j = m.end() | ||||
else: | ||||
return -1 | ||||
c = rawdata[j:j+1] | ||||
if not c: | ||||
return -1 | ||||
if c == "#": | ||||
if rawdata[j:] == "#": | ||||
# end of buffer | ||||
return -1 | ||||
name, j = self._scan_name(j + 1, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
c = rawdata[j:j+1] | ||||
if not c: | ||||
return -1 | ||||
if c == '>': | ||||
# all done | ||||
return j + 1 | ||||
# Internal -- scan past <!NOTATION declarations | ||||
def _parse_doctype_notation(self, i, declstartpos): | ||||
name, j = self._scan_name(i, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
rawdata = self.rawdata | ||||
while 1: | ||||
c = rawdata[j:j+1] | ||||
if not c: | ||||
# end of buffer; incomplete | ||||
return -1 | ||||
if c == '>': | ||||
return j + 1 | ||||
if c in "'\"": | ||||
m = _declstringlit_match(rawdata, j) | ||||
if not m: | ||||
return -1 | ||||
j = m.end() | ||||
else: | ||||
name, j = self._scan_name(j, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
# Internal -- scan past <!ENTITY declarations | ||||
def _parse_doctype_entity(self, i, declstartpos): | ||||
rawdata = self.rawdata | ||||
if rawdata[i:i+1] == "%": | ||||
j = i + 1 | ||||
while 1: | ||||
c = rawdata[j:j+1] | ||||
if not c: | ||||
return -1 | ||||
if c.isspace(): | ||||
j = j + 1 | ||||
else: | ||||
break | ||||
else: | ||||
j = i | ||||
name, j = self._scan_name(j, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
while 1: | ||||
c = self.rawdata[j:j+1] | ||||
if not c: | ||||
return -1 | ||||
if c in "'\"": | ||||
m = _declstringlit_match(rawdata, j) | ||||
if m: | ||||
j = m.end() | ||||
else: | ||||
return -1 # incomplete | ||||
elif c == ">": | ||||
return j + 1 | ||||
else: | ||||
name, j = self._scan_name(j, declstartpos) | ||||
if j < 0: | ||||
return j | ||||
# Internal -- scan a name token and the new position and the token, or | ||||
# return -1 if we've reached the end of the buffer. | ||||
def _scan_name(self, i, declstartpos): | ||||
rawdata = self.rawdata | ||||
n = len(rawdata) | ||||
if i == n: | ||||
return None, -1 | ||||
m = _declname_match(rawdata, i) | ||||
if m: | ||||
s = m.group() | ||||
name = s.strip() | ||||
if (i + len(s)) == n: | ||||
return None, -1 # end of buffer | ||||
return name.lower(), m.end() | ||||
else: | ||||
self.updatepos(declstartpos, i) | ||||
self.error("expected name token at %r" | ||||
% rawdata[declstartpos:declstartpos+20]) | ||||
# To be overridden -- handlers for unknown objects | ||||
def unknown_decl(self, data): | ||||
pass | ||||