cafe-grader-web Files · lib/assets/Lib/html/parser.py

cafe-grader-web

Location: cafe-grader-web/lib/assets/Lib/html/parser.py - annotation

Commit Description:

Merge pull request #17 from nattee/master...

Commit Description:

Merge pull request #17 from nattee/master upgrade to current working snapshot

References:

r715:88baf2c40c51 default

File last commit:

r584:056392c00e67 java

Show/Diff file:

Action:

            
                    lib/assets/Lib/html/parser.py
                
             | 532 lines
             | 19.7 KiB
             | text/x-python 
             | PythonLexer
             | 
      
            History
        
           Show Full History
         |
          Source
         | Raw
         |
              
              Download
              
     admin
  
merge with algo and add brython files that were missing

              r584
            
      """A parser for HTML and XHTML."""

      # This file is based on sgmllib.py, but the API is slightly different.

      # XXX There should be a way to distinguish between PCDATA (parsed

      # character data -- the normal case), RCDATA (replaceable character

      # data -- only char and entity references and end tags are special)

      # and CDATA (character data -- only end tags are special).

      import _markupbase

      import re

      import warnings

      # Regular expressions used for parsing

      # Text-scanning patterns used by HTMLParser.goahead().
      interesting_normal = re.compile('[&<]')
      incomplete = re.compile('&[a-zA-Z#]')

      # Both reference patterns also consume the one character that terminates
      # the reference; callers back up by one when that character is not ';'.
      entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
      charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

      starttagopen = re.compile('<[a-zA-Z]')
      piclose = re.compile('>')
      commentclose = re.compile(r'--\s*>')
      # Raw string: '\s' inside a plain literal is an invalid string escape
      # (it only worked because Python passed the backslash through).
      tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
      # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
      # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
      # Deliberately NOT raw: \t \n \r \f \x00 are real control characters here.
      tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
      # Note:
      #  1) the strict attrfind isn't really strict, but we can't make it
      #     correctly strict without breaking backward compatibility;
      #  2) if you change attrfind remember to update locatestarttagend too;
      #  3) if you change attrfind and/or locatestarttagend the parser will
      #     explode, so don't do it.
      attrfind = re.compile(
          r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
          r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
      attrfind_tolerant = re.compile(
          r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
          r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
      # Matches a start tag up to, but not including, its closing '>' or '/>'.
      locatestarttagend = re.compile(r"""
        <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
        (?:\s+                             # whitespace before attribute name
          (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
            (?:\s*=\s*                     # value indicator
              (?:'[^']*'                   # LITA-enclosed value
                |\"[^\"]*\"                # LIT-enclosed value
                |[^'\">\s]+                # bare value
               )
             )?
           )
         )*
        \s*                                # trailing whitespace
      """, re.VERBOSE)
      locatestarttagend_tolerant = re.compile(r"""
        <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
        (?:[\s/]*                          # optional whitespace before attribute name
          (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
            (?:\s*=+\s*                    # value indicator
              (?:'[^']*'                   # LITA-enclosed value
                |"[^"]*"                   # LIT-enclosed value
                |(?!['"])[^>\s]*           # bare value
               )
               (?:\s*,)*                   # possibly followed by a comma
             )?(?:\s|/(?!>))*
           )*
         )?
        \s*                                # trailing whitespace
      """, re.VERBOSE)
      endendtag = re.compile('>')
      # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
      # </ and the tag name, so maybe this should be fixed
      # Raw string for the same reason as tagfind above.
      endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

      class HTMLParseError(Exception):
          """Error raised by the parser for malformed input.

          Attributes set by the constructor:
            msg    -- the error description (must be truthy)
            lineno -- first item of *position*, or None
            offset -- second item of *position*, or None
          """

          def __init__(self, msg, position=(None, None)):
              assert msg
              self.msg = msg
              self.lineno = position[0]
              self.offset = position[1]

          def __str__(self):
              # Build the optional ", at line L" / ", column C" tail first and
              # attach it to the message only if there is anything to attach.
              where = ""
              if self.lineno is not None:
                  where = where + ", at line %d" % self.lineno
              if self.offset is not None:
                  # offset is stored zero-based; it is displayed one-based
                  where = where + ", column %d" % (self.offset + 1)
              if not where:
                  return self.msg
              return self.msg + where

      class HTMLParser(_markupbase.ParserBase):

          """Find tags and other markup and call handler functions.

          Usage:

              p = HTMLParser()

              p.feed(data)

              ...

              p.close()

          Start tags are handled by calling self.handle_starttag() or

          self.handle_startendtag(); end tags by self.handle_endtag().  The

          data between tags is passed from the parser to the derived class

          by calling self.handle_data() with the data as argument (the data

          may be split up in arbitrary chunks).  Entity references are

          passed by calling self.handle_entityref() with the entity

          reference as the argument.  Numeric character references are

          passed to self.handle_charref() with the string containing the

          reference as the argument.

          """

          CDATA_CONTENT_ELEMENTS = ("script", "style")

          def __init__(self, strict=False):

              """Initialize and reset this instance.

              If strict is set to False (the default) the parser will parse invalid

              markup, otherwise it will raise an error.  Note that the strict mode

              is deprecated.

              """

              if strict:

                  warnings.warn("The strict mode is deprecated.",

                                DeprecationWarning, stacklevel=2)

              self.strict = strict

              self.reset()

          def reset(self):
              """Return the parser to its initial state.

              Any input that was fed but not yet processed is discarded.
              """
              # Leave CDATA mode and go back to scanning for '&' and '<'.
              self.cdata_elem = None
              self.interesting = interesting_normal
              self.lasttag = '???'
              self.rawdata = ''
              _markupbase.ParserBase.reset(self)

          def feed(self, data):

              r"""Feed data to the parser.

              Call this as often as you want, with as little or as much text

              as you want (may include '\n').

              """

              self.rawdata = self.rawdata + data

              self.goahead(0)

          def close(self):

              """Handle any buffered data."""

              self.goahead(1)

          def error(self, message):
              """Raise HTMLParseError for *message*, tagged with the current position."""
              position = self.getpos()
              raise HTMLParseError(message, position)

          __starttag_text = None

          def get_starttag_text(self):

              """Return full source of start tag: '<...>'."""

              return self.__starttag_text

          def set_cdata_mode(self, elem):

              self.cdata_elem = elem.lower()

              self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

          def clear_cdata_mode(self):
              """Leave CDATA mode and resume scanning for '&' and '<'."""
              self.cdata_elem = None
              self.interesting = interesting_normal

          # Internal -- handle data as far as reasonable.  May leave state

          # and data to be processed by a subsequent call.  If 'end' is

          # true, force handling all data as if followed by EOF marker.

          def goahead(self, end):
              """Parse as much of self.rawdata as is currently possible.

              Text is passed to handle_data(); markup starting with '<' is
              dispatched to the parse_*() methods; character and entity
              references go to handle_charref() / handle_entityref().  Parsing
              stops at the first construct that is still incomplete, and the
              unconsumed tail is stored back in self.rawdata for the next call.

              end -- true when no more input will follow (see close()).  An
                     unterminated construct is then reported via error() in
                     strict mode, or emitted as plain data otherwise.
              """
              rawdata = self.rawdata
              i = 0
              n = len(rawdata)
              while i < n:
                  match = self.interesting.search(rawdata, i) # < or &
                  if match:
                      j = match.start()
                  else:
                      if self.cdata_elem:
                          # inside <script>/<style>: wait for the closing tag
                          break
                      j = n
                  if i < j:
                      self.handle_data(rawdata[i:j])
                  i = self.updatepos(i, j)
                  if i == n:
                      break
                  startswith = rawdata.startswith
                  if startswith('<', i):
                      if starttagopen.match(rawdata, i): # < + letter
                          k = self.parse_starttag(i)
                      elif startswith("</", i):
                          k = self.parse_endtag(i)
                      elif startswith("<!--", i):
                          k = self.parse_comment(i)
                      elif startswith("<?", i):
                          k = self.parse_pi(i)
                      elif startswith("<!", i):
                          if self.strict:
                              k = self.parse_declaration(i)
                          else:
                              k = self.parse_html_declaration(i)
                      elif (i + 1) < n:
                          # a '<' that starts no known construct is literal text
                          self.handle_data("<")
                          k = i + 1
                      else:
                          # lone '<' at the end of the buffer: need more input
                          break
                      if k < 0:
                          # the construct is not terminated yet
                          if not end:
                              break
                          if self.strict:
                              self.error("EOF in middle of construct")
                          # Tolerant EOF handling: emit the broken construct as
                          # data, up to and including the next '>', else up to
                          # the next '<', else just the single '<'.
                          k = rawdata.find('>', i + 1)
                          if k < 0:
                              k = rawdata.find('<', i + 1)
                              if k < 0:
                                  k = i + 1
                          else:
                              k += 1
                          self.handle_data(rawdata[i:k])
                      i = self.updatepos(i, k)
                  elif startswith("&#", i):
                      match = charref.match(rawdata, i)
                      if match:
                          # strip the leading '&#' and the terminating character
                          name = match.group()[2:-1]
                          self.handle_charref(name)
                          k = match.end()
                          if not startswith(';', k-1):
                              # the terminator was not ';', so do not consume it
                              k = k - 1
                          i = self.updatepos(i, k)
                          continue
                      else:
                          if ";" in rawdata[i:]:  # bail by consuming &#
                              # Positions must be relative to i, not to the
                              # start of the buffer; otherwise the wrong text
                              # is emitted and the cursor moves to index 2.
                              self.handle_data(rawdata[i:i+2])
                              i = self.updatepos(i, i+2)
                          break
                  elif startswith('&', i):
                      match = entityref.match(rawdata, i)
                      if match:
                          name = match.group(1)
                          self.handle_entityref(name)
                          k = match.end()
                          if not startswith(';', k-1):
                              k = k - 1
                          i = self.updatepos(i, k)
                          continue
                      match = incomplete.match(rawdata, i)
                      if match:
                          # match.group() will contain at least 2 chars
                          if end and match.group() == rawdata[i:]:
                              if self.strict:
                                  self.error("EOF in middle of entity or char ref")
                              else:
                                  i = self.updatepos(i, i + 1)
                          # incomplete
                          break
                      elif (i + 1) < n:
                          # not the end of the buffer, and can't be confused
                          # with some other construct
                          self.handle_data("&")
                          i = self.updatepos(i, i + 1)
                      else:
                          break
                  else:
                      assert 0, "interesting.search() lied"
              # end while
              if end and i < n and not self.cdata_elem:
                  # flush whatever is left as plain data
                  self.handle_data(rawdata[i:n])
                  i = self.updatepos(i, n)
              self.rawdata = rawdata[i:]

          # Internal -- parse html declarations, return length or -1 if not terminated

          # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state

          # See also parse_declaration in _markupbase

          def parse_html_declaration(self, i):

              rawdata = self.rawdata

              assert rawdata[i:i+2] == '<!', ('unexpected call to '

                                              'parse_html_declaration()')

              if rawdata[i:i+4] == '<!--':

                  # this case is actually already handled in goahead()

                  return self.parse_comment(i)

              elif rawdata[i:i+3] == '<![':

                  return self.parse_marked_section(i)

              elif rawdata[i:i+9].lower() == '<!doctype':

                  # find the closing >

                  gtpos = rawdata.find('>', i+9)

                  if gtpos == -1:

                      return -1

                  self.handle_decl(rawdata[i+2:gtpos])

                  return gtpos+1

              else:

                  return self.parse_bogus_comment(i)

          # Internal -- parse bogus comment, return length or -1 if not terminated

          # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state

          def parse_bogus_comment(self, i, report=1):

              rawdata = self.rawdata

              assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '

                                                      'parse_comment()')

              pos = rawdata.find('>', i+2)

              if pos == -1:

                  return -1

              if report:

                  self.handle_comment(rawdata[i+2:pos])

              return pos + 1

          # Internal -- parse processing instr, return end or -1 if not terminated

          def parse_pi(self, i):
              """Parse the processing instruction ('<?...>') starting at *i*.

              The text between '<?' and the next '>' is passed to handle_pi().
              Returns the index just past that '>', or -1 if it is not in the
              buffer yet.
              """
              rawdata = self.rawdata
              assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
              closing = piclose.search(rawdata, i+2) # >
              if not closing:
                  return -1
              self.handle_pi(rawdata[i+2:closing.start()])
              return closing.end()

          # Internal -- handle starttag, return end or -1 if not terminated

          def parse_starttag(self, i):
              """Parse the start tag that begins at index *i* of self.rawdata.

              Records the tag's source text (see get_starttag_text()), sets
              self.lasttag, and calls handle_starttag() or, for an XHTML-style
              '<tag ... />', handle_startendtag().  Tag and attribute names are
              lowercased; quoted attribute values lose their quotes and are
              passed through unescape(); an attribute without '=' gets the
              value None.  Elements listed in CDATA_CONTENT_ELEMENTS switch the
              parser into CDATA mode.

              Returns the index just past the tag, or a negative number when
              the tag is not complete yet.  If unparsable text remains before
              the closing '>', strict mode reports it through error();
              otherwise the whole tag is emitted via handle_data().
              """
              self.__starttag_text = None
              endpos = self.check_for_whole_start_tag(i)
              if endpos < 0:
                  return endpos
              rawdata = self.rawdata
              self.__starttag_text = rawdata[i:endpos]

              # Now parse the data between i+1 and endpos into a tag and attrs
              attrs = []
              match = tagfind.match(rawdata, i+1)
              assert match, 'unexpected call to parse_starttag()'
              k = match.end()
              self.lasttag = tag = match.group(1).lower()
              # The attribute pattern depends only on the parsing mode, so
              # pick it once instead of on every loop iteration.
              find_attr = attrfind if self.strict else attrfind_tolerant
              while k < endpos:
                  m = find_attr.match(rawdata, k)
                  if not m:
                      break
                  attrname, rest, attrvalue = m.group(1, 2, 3)
                  if not rest:
                      # bare attribute: no '=value' part at all
                      attrvalue = None
                  elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                       attrvalue[:1] == '"' == attrvalue[-1:]:
                      attrvalue = attrvalue[1:-1]
                  if attrvalue:
                      attrvalue = self.unescape(attrvalue)
                  attrs.append((attrname.lower(), attrvalue))
                  k = m.end()

              end = rawdata[k:endpos].strip()
              if end not in (">", "/>"):
                  # Something other than the tag terminator is left over.
                  if self.strict:
                      self.error("junk characters in start tag: %r"
                                 % (rawdata[k:endpos][:20],))
                  self.handle_data(rawdata[i:endpos])
                  return endpos
              if end.endswith('/>'):
                  # XHTML-style empty tag: <span attr="value" />
                  self.handle_startendtag(tag, attrs)
              else:
                  self.handle_starttag(tag, attrs)
                  if tag in self.CDATA_CONTENT_ELEMENTS:
                      self.set_cdata_mode(tag)
              return endpos

          # Internal -- check to see if we have a complete starttag; return end

          # or -1 if incomplete.

          def check_for_whole_start_tag(self, i):
              """Locate the end of the start tag that begins at index *i*.

              Returns the index just past the tag's closing '>' or '/>', or -1
              when the buffer does not hold the complete tag yet.  For a
              malformed tag, strict mode calls error(); tolerant mode returns
              the index where the recognisable part of the tag stops.

              Raises AssertionError if no start tag begins at *i*.
              """
              rawdata = self.rawdata
              if self.strict:
                  m = locatestarttagend.match(rawdata, i)
              else:
                  m = locatestarttagend_tolerant.match(rawdata, i)
              if not m:
                  raise AssertionError("we should not get here!")

              j = m.end()
              # Character right after the matched part ('' at end of buffer).
              # Not called 'next', to avoid shadowing the builtin.
              following = rawdata[j:j+1]
              if following == ">":
                  return j + 1
              if following == "/":
                  if rawdata.startswith("/>", j):
                      return j + 2
                  # rawdata[j] is '/', but it is not part of '/>': treat it as
                  # a buffer boundary and wait for more input.  No other
                  # outcome is possible once following == "/".
                  return -1
              if following == "":
                  # end of input
                  return -1
              if following in ("abcdefghijklmnopqrstuvwxyz=/"
                               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                  # end of input in or before attribute value, or we have the
                  # '/' from a '/>' ending
                  return -1
              if self.strict:
                  self.updatepos(i, j)
                  self.error("malformed start tag")
              if j > i:
                  return j
              else:
                  return i + 1

          # Internal -- parse endtag, return end or -1 if incomplete

          def parse_endtag(self, i):
              """Parse the end tag ('</...') that begins at index *i*.

              Returns the index just past the construct, or -1 if no '>' has
              arrived yet.  A well-formed end tag is reported through
              handle_endtag() and ends CDATA mode.  While in CDATA mode, any
              end tag other than the one for the current CDATA element is
              passed to handle_data() unchanged.  A malformed end tag raises
              through error() in strict mode; otherwise its name is recovered
              as well as possible, '</>' is skipped, and anything else is
              handled as a bogus comment.
              """
              rawdata = self.rawdata
              assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
              closing = endendtag.search(rawdata, i+1) # >
              if not closing:
                  return -1
              gtpos = closing.end()

              wellformed = endtagfind.match(rawdata, i) # </ + tag + >
              if wellformed:
                  elem = wellformed.group(1).lower() # script or style
                  if self.cdata_elem is not None and elem != self.cdata_elem:
                      # some other element's end tag inside CDATA: plain text
                      self.handle_data(rawdata[i:gtpos])
                      return gtpos
                  self.handle_endtag(elem)
                  self.clear_cdata_mode()
                  return gtpos

              # From here on the end tag is malformed.
              if self.cdata_elem is not None:
                  self.handle_data(rawdata[i:gtpos])
                  return gtpos
              if self.strict:
                  self.error("bad end tag: %r" % (rawdata[i:gtpos],))
              # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
              namematch = tagfind_tolerant.match(rawdata, i+2)
              if namematch:
                  tagname = namematch.group().lower()
                  # consume and ignore other stuff between the name and the >
                  # Note: this is not 100% correct, since we might have things
                  # like </tag attr=">">, but looking for > after the name
                  # should cover most of the cases and is much simpler
                  gtpos = rawdata.find('>', namematch.end())
                  self.handle_endtag(tagname)
                  return gtpos+1
              # w3.org/TR/html5/tokenization.html#end-tag-open-state
              if rawdata[i:i+3] == '</>':
                  return i+3
              return self.parse_bogus_comment(i)

          # Overridable -- finish processing of start+end tag: <tag.../>

          def handle_startendtag(self, tag, attrs):

              self.handle_starttag(tag, attrs)

              self.handle_endtag(tag)

          # Overridable -- handle start tag

          def handle_starttag(self, tag, attrs):

              pass

          # Overridable -- handle end tag

          def handle_endtag(self, tag):

              pass

          # Overridable -- handle character reference

          def handle_charref(self, name):

              pass

          # Overridable -- handle entity reference

          def handle_entityref(self, name):

              pass

          # Overridable -- handle data

          def handle_data(self, data):

              pass

          # Overridable -- handle comment

          def handle_comment(self, data):

              pass

          # Overridable -- handle declaration

          def handle_decl(self, decl):

              pass

          # Overridable -- handle processing instruction

          def handle_pi(self, data):

              pass

          def unknown_decl(self, data):

              if self.strict:

                  self.error("unknown declaration: %r" % (data,))

          # Internal -- helper to remove special character quoting

          def unescape(self, s):

              if '&' not in s:

                  return s

              def replaceEntities(s):

                  s = s.groups()[0]

                  try:

                      if s[0] == "#":

                          s = s[1:]

                          if s[0] in ['x','X']:

                              c = int(s[1:].rstrip(';'), 16)

                          else:

                              c = int(s.rstrip(';'))

                          return chr(c)

                  except ValueError:

                      return '&#' + s

                  else:

                      from html.entities import html5

                      if s in html5:

                          return html5[s]

                      elif s.endswith(';'):

                          return '&' + s

                      for x in range(2, len(s)):

                          if s[:x] in html5:

                              return html5[s[:x]] + s[x:]

                      else:

                          return '&' + s

              return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",

                            replaceEntities, s, flags=re.ASCII)

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository permissions settings

Sign in to your account

Last Author

admin merge with algo and add brython files that were missing	r584	"""A parser for HTML and XHTML."""

		# This file is based on sgmllib.py, but the API is slightly different.

		# XXX There should be a way to distinguish between PCDATA (parsed
		# character data -- the normal case), RCDATA (replaceable character
		# data -- only char and entity references and end tags are special)
		# and CDATA (character data -- only end tags are special).


		import _markupbase
		import re
		import warnings

		# Regular expressions used for parsing

		interesting_normal = re.compile('[&<]')
		incomplete = re.compile('&[a-zA-Z#]')

		entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
		charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

		starttagopen = re.compile('<[a-zA-Z]')
		piclose = re.compile('>')
		commentclose = re.compile(r'--\s*>')
		tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
		# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
		# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
		tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
		# Note:
		# 1) the strict attrfind isn't really strict, but we can't make it
		# correctly strict without breaking backward compatibility;
		# 2) if you change attrfind remember to update locatestarttagend too;
		# 3) if you change attrfind and/or locatestarttagend the parser will
		# explode, so don't do it.
		attrfind = re.compile(
		r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
		r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
		attrfind_tolerant = re.compile(
		r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
		r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
		locatestarttagend = re.compile(r"""
		<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
		(?:\s+ # whitespace before attribute name
		(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
		(?:\s*=\s* # value indicator
		(?:'[^']*' # LITA-enclosed value
		|\"[^\"]*\" # LIT-enclosed value
		|[^'\">\s]+ # bare value
		)
		)?
		)
		)*
		\s* # trailing whitespace
		""", re.VERBOSE)
		locatestarttagend_tolerant = re.compile(r"""
		<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
		(?:[\s/]* # optional whitespace before attribute name
		(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
		(?:\s*=+\s* # value indicator
		(?:'[^']*' # LITA-enclosed value
		|"[^"]*" # LIT-enclosed value
		|(?!['"])[^>\s]* # bare value
		)
		(?:\s*,)* # possibly followed by a comma
		)?(?:\s|/(?!>))*
		)*
		)?
		\s* # trailing whitespace
		""", re.VERBOSE)
		endendtag = re.compile('>')
		# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
		# </ and the tag name, so maybe this should be fixed
		endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


		class HTMLParseError(Exception):
		"""Exception raised for all parse errors."""

		def __init__(self, msg, position=(None, None)):
		assert msg
		self.msg = msg
		self.lineno = position[0]
		self.offset = position[1]

		def __str__(self):
		result = self.msg
		if self.lineno is not None:
		result = result + ", at line %d" % self.lineno
		if self.offset is not None:
		result = result + ", column %d" % (self.offset + 1)
		return result


		class HTMLParser(_markupbase.ParserBase):
		"""Find tags and other markup and call handler functions.

		Usage:
		p = HTMLParser()
		p.feed(data)
		...
		p.close()

		Start tags are handled by calling self.handle_starttag() or
		self.handle_startendtag(); end tags by self.handle_endtag(). The
		data between tags is passed from the parser to the derived class
		by calling self.handle_data() with the data as argument (the data
		may be split up in arbitrary chunks). Entity references are
		passed by calling self.handle_entityref() with the entity
		reference as the argument. Numeric character references are
		passed to self.handle_charref() with the string containing the
		reference as the argument.
		"""

		CDATA_CONTENT_ELEMENTS = ("script", "style")

		def __init__(self, strict=False):
		"""Initialize and reset this instance.

		If strict is set to False (the default) the parser will parse invalid
		markup, otherwise it will raise an error. Note that the strict mode
		is deprecated.
		"""
		if strict:
		warnings.warn("The strict mode is deprecated.",
		DeprecationWarning, stacklevel=2)
		self.strict = strict
		self.reset()

		def reset(self):
		"""Reset this instance. Loses all unprocessed data."""
		self.rawdata = ''
		self.lasttag = '???'
		self.interesting = interesting_normal
		self.cdata_elem = None
		_markupbase.ParserBase.reset(self)

		def feed(self, data):
		r"""Feed data to the parser.

		Call this as often as you want, with as little or as much text
		as you want (may include '\n').
		"""
		self.rawdata = self.rawdata + data
		self.goahead(0)

		def close(self):
		"""Handle any buffered data."""
		self.goahead(1)

		def error(self, message):
		raise HTMLParseError(message, self.getpos())

		__starttag_text = None

		def get_starttag_text(self):
		"""Return full source of start tag: '<...>'."""
		return self.__starttag_text

		def set_cdata_mode(self, elem):
		self.cdata_elem = elem.lower()
		self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

		def clear_cdata_mode(self):
		self.interesting = interesting_normal
		self.cdata_elem = None

		# Internal -- handle data as far as reasonable. May leave state
		# and data to be processed by a subsequent call. If 'end' is
		# true, force handling all data as if followed by EOF marker.
		def goahead(self, end):
		rawdata = self.rawdata
		i = 0
		n = len(rawdata)
		while i < n:
		match = self.interesting.search(rawdata, i) # < or &
		if match:
		j = match.start()
		else:
		if self.cdata_elem:
		break
		j = n
		if i < j: self.handle_data(rawdata[i:j])
		i = self.updatepos(i, j)
		if i == n: break
		startswith = rawdata.startswith
		if startswith('<', i):
		if starttagopen.match(rawdata, i): # < + letter
		k = self.parse_starttag(i)
		elif startswith("</", i):
		k = self.parse_endtag(i)
		elif startswith("<!--", i):
		k = self.parse_comment(i)
		elif startswith("<?", i):
		k = self.parse_pi(i)
		elif startswith("<!", i):
		if self.strict:
		k = self.parse_declaration(i)
		else:
		k = self.parse_html_declaration(i)
		elif (i + 1) < n:
		self.handle_data("<")
		k = i + 1
		else:
		break
		if k < 0:
		if not end:
		break
		if self.strict:
		self.error("EOF in middle of construct")
		k = rawdata.find('>', i + 1)
		if k < 0:
		k = rawdata.find('<', i + 1)
		if k < 0:
		k = i + 1
		else:
		k += 1
		self.handle_data(rawdata[i:k])
		i = self.updatepos(i, k)
		elif startswith("&#", i):
		match = charref.match(rawdata, i)
		if match:
		name = match.group()[2:-1]
		self.handle_charref(name)
		k = match.end()
		if not startswith(';', k-1):
		k = k - 1
		i = self.updatepos(i, k)
		continue
		else:
		if ";" in rawdata[i:]: #bail by consuming &#
		self.handle_data(rawdata[0:2])
		i = self.updatepos(i, 2)
		break
		elif startswith('&', i):
		match = entityref.match(rawdata, i)
		if match:
		name = match.group(1)
		self.handle_entityref(name)
		k = match.end()
		if not startswith(';', k-1):
		k = k - 1
		i = self.updatepos(i, k)
		continue
		match = incomplete.match(rawdata, i)
		if match:
		# match.group() will contain at least 2 chars
		if end and match.group() == rawdata[i:]:
		if self.strict:
		self.error("EOF in middle of entity or char ref")
		else:
		k = match.end()
		if k <= i:
		k = n
		i = self.updatepos(i, i + 1)
		# incomplete
		break
		elif (i + 1) < n:
		# not the end of the buffer, and can't be confused
		# with some other construct
		self.handle_data("&")
		i = self.updatepos(i, i + 1)
		else:
		break
		else:
		assert 0, "interesting.search() lied"
		# end while
		if end and i < n and not self.cdata_elem:
		self.handle_data(rawdata[i:n])
		i = self.updatepos(i, n)
		self.rawdata = rawdata[i:]

		# Internal -- parse html declarations, return end index or -1 if not terminated
		# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
		# See also parse_declaration in _markupbase
		def parse_html_declaration(self, i):
		    """Dispatch on the kind of ``<!...`` construct found at index *i*.

		    Comments, marked sections and unrecognised constructs are handed to
		    ``parse_comment``, ``parse_marked_section`` and
		    ``parse_bogus_comment`` respectively, and their result is returned
		    unchanged.  A doctype is reported through ``handle_decl`` with the
		    text between ``<!`` and the closing ``>``.

		    Returns the index just past the doctype's ``>``, or -1 when the
		    doctype has no closing ``>`` in the buffer.
		    """
		    text = self.rawdata
		    assert text[i:i+2] == '<!', ('unexpected call to '
		                                 'parse_html_declaration()')
		    if text[i:i+4] == '<!--':
		        # goahead() tests for '<!--' before '<!', so this is a fallback
		        return self.parse_comment(i)
		    if text[i:i+3] == '<![':
		        return self.parse_marked_section(i)
		    if text[i:i+9].lower() != '<!doctype':
		        return self.parse_bogus_comment(i)

		    # doctype: the declaration runs up to the first '>'
		    close = text.find('>', i+9)
		    if close == -1:
		        return -1
		    self.handle_decl(text[i+2:close])
		    return close + 1

		# Internal -- parse bogus comment, return end index or -1 if not terminated
		# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
		def parse_bogus_comment(self, i, report=1):
		    """Consume a bogus comment that starts with ``<!`` or ``</`` at *i*.

		    The text between the two-character opener and the next ``>`` is
		    passed to ``handle_comment`` unless *report* is false.

		    Returns the index just past the ``>``, or -1 if no ``>`` follows.
		    Raises AssertionError when *i* does not point at ``<!`` or ``</``.
		    """
		    rawdata = self.rawdata
		    # The message names this method (it used to say parse_comment()).
		    assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
		                                            'parse_bogus_comment()')
		    pos = rawdata.find('>', i+2)
		    if pos == -1:
		        return -1
		    if report:
		        self.handle_comment(rawdata[i+2:pos])
		    return pos + 1

		# Internal -- parse processing instr, return end or -1 if not terminated
		def parse_pi(self, i):
		    """Parse the processing instruction that opens with ``<?`` at *i*.

		    The text between ``<?`` and the first ``piclose`` match is handed
		    to ``handle_pi``.  Returns the index where that match ends, or -1
		    when ``piclose`` does not match anywhere after the opener.
		    """
		    text = self.rawdata
		    assert text[i:i+2] == '<?', 'unexpected call to parse_pi()'
		    closer = piclose.search(text, i+2)  # >
		    if closer is None:
		        return -1
		    self.handle_pi(text[i+2:closer.start()])
		    return closer.end()

		# Internal -- handle starttag, return end or -1 if not terminated
		def parse_starttag(self, i):
		    """Parse the start tag that begins at index *i* of ``self.rawdata``.

		    The tag name is lower-cased and stored in ``self.lasttag``.  The
		    attributes are collected as ``(lower-cased name, value)`` pairs:
		    the value is None when the attribute has no ``=value`` part, one
		    pair of matching surrounding quotes is stripped, and non-empty
		    values are passed through ``self.unescape``.

		    A tag ending in ``/>`` is reported via ``handle_startendtag``;
		    any other well-formed tag via ``handle_starttag``, after which
		    ``set_cdata_mode`` is called if the tag is listed in
		    ``CDATA_CONTENT_ELEMENTS``.  If unparsed junk remains before the
		    closing bracket, the whole tag text is reported via ``handle_data``
		    instead (preceded by ``self.error`` in strict mode).

		    Returns the index just past the tag, or the negative value from
		    ``check_for_whole_start_tag`` when the tag is not yet complete.
		    """
		    self.__starttag_text = None
		    endpos = self.check_for_whole_start_tag(i)
		    if endpos < 0:
		        return endpos
		    rawdata = self.rawdata
		    self.__starttag_text = rawdata[i:endpos]

		    # Parse the text between i+1 and endpos into a tag name and attrs.
		    attrs = []
		    match = tagfind.match(rawdata, i+1)
		    assert match, 'unexpected call to parse_starttag()'
		    k = match.end()
		    self.lasttag = tag = match.group(1).lower()
		    while k < endpos:
		        if self.strict:
		            m = attrfind.match(rawdata, k)
		        else:
		            m = attrfind_tolerant.match(rawdata, k)
		        if not m:
		            break
		        attrname, rest, attrvalue = m.group(1, 2, 3)
		        if not rest:
		            # bare attribute without "=value"
		            attrvalue = None
		        elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
		             attrvalue[:1] == '"' == attrvalue[-1:]:
		            attrvalue = attrvalue[1:-1]
		        if attrvalue:
		            attrvalue = self.unescape(attrvalue)
		        attrs.append((attrname.lower(), attrvalue))
		        k = m.end()

		    end = rawdata[k:endpos].strip()
		    if end not in (">", "/>"):
		        # Junk between the last attribute and the closing bracket.
		        # (The line/offset bookkeeping that used to sit here was never
		        # read, so it has been dropped.)
		        if self.strict:
		            self.error("junk characters in start tag: %r"
		                       % (rawdata[k:endpos][:20],))
		        self.handle_data(rawdata[i:endpos])
		        return endpos
		    if end.endswith('/>'):
		        # XHTML-style empty tag: <span attr="value" />
		        self.handle_startendtag(tag, attrs)
		    else:
		        self.handle_starttag(tag, attrs)
		        if tag in self.CDATA_CONTENT_ELEMENTS:
		            self.set_cdata_mode(tag)
		    return endpos

		# Internal -- check to see if we have a complete starttag; return end
		# or -1 if incomplete.
		def check_for_whole_start_tag(self, i):
		    """Locate the end of the start tag that begins at index *i*.

		    Uses ``locatestarttagend`` in strict mode and
		    ``locatestarttagend_tolerant`` otherwise, then inspects the
		    character that follows the match.

		    Returns the index just past the tag, or -1 when the buffer ends
		    before the tag is complete.  For malformed input it calls
		    ``self.error`` in strict mode and otherwise returns a resume index
		    that is always greater than *i*.  Raises AssertionError if the
		    pattern does not match at *i* at all.
		    """
		    text = self.rawdata
		    if self.strict:
		        found = locatestarttagend.match(text, i)
		    else:
		        found = locatestarttagend_tolerant.match(text, i)
		    if not found:
		        raise AssertionError("we should not get here!")

		    j = found.end()
		    following = text[j:j+1]
		    if following == ">":
		        return j + 1
		    if following == "/":
		        if text.startswith("/>", j):
		            return j + 2
		        if text.startswith("/", j):
		            # buffer boundary
		            return -1
		        # else bogus input
		        # NOTE(review): the two checks above look exhaustive once
		        # `following` is "/", so this tail appears unreachable.
		        if self.strict:
		            self.updatepos(i, j + 1)
		            self.error("malformed empty start tag")
		        return max(j, i + 1)
		    if following == "":
		        # end of input
		        return -1
		    if following in ("abcdefghijklmnopqrstuvwxyz=/"
		                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
		        # end of input in or before attribute value, or we have the
		        # '/' from a '/>' ending
		        return -1
		    if self.strict:
		        self.updatepos(i, j)
		        self.error("malformed start tag")
		    # always make progress, even if the match was empty
		    return max(j, i + 1)

		# Internal -- parse endtag, return end or -1 if incomplete
		def parse_endtag(self, i):
		    """Parse the end tag that opens with ``</`` at index *i*.

		    A well-formed tag (``endtagfind`` matches) is reported through
		    ``handle_endtag`` and CDATA mode is cleared.  While
		    ``self.cdata_elem`` is set, anything other than the matching end
		    tag is reported as plain data through ``handle_data``.

		    A malformed tag calls ``self.error`` in strict mode.  Otherwise
		    the tag name is recovered with ``tagfind_tolerant`` and reported,
		    ``</>`` is skipped silently, and anything else is delegated to
		    ``parse_bogus_comment``.

		    Returns the index just past the tag, or -1 if ``endendtag`` finds
		    no terminator in the buffer.
		    """
		    text = self.rawdata
		    assert text[i:i+2] == "</", "unexpected call to parse_endtag"
		    closer = endendtag.search(text, i+1)  # >
		    if not closer:
		        return -1
		    gtpos = closer.end()
		    in_cdata = self.cdata_elem is not None

		    wellformed = endtagfind.match(text, i)  # </ + tag + >
		    if wellformed:
		        elem = wellformed.group(1).lower()  # script or style
		        if in_cdata and elem != self.cdata_elem:
		            # some other closing tag inside CDATA content: plain data
		            self.handle_data(text[i:gtpos])
		            return gtpos
		        self.handle_endtag(elem)
		        self.clear_cdata_mode()
		        return gtpos

		    # From here on the end tag is malformed.
		    if in_cdata:
		        self.handle_data(text[i:gtpos])
		        return gtpos
		    if self.strict:
		        self.error("bad end tag: %r" % (text[i:gtpos],))
		    # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
		    namematch = tagfind_tolerant.match(text, i+2)
		    if namematch:
		        tagname = namematch.group().lower()
		        # consume and ignore other stuff between the name and the >
		        # Note: this is not 100% correct, since we might have things
		        # like </tag attr=">">, but looking for > after the name should
		        # cover most of the cases and is much simpler
		        gtpos = text.find('>', namematch.end())
		        self.handle_endtag(tagname)
		        return gtpos + 1
		    # w3.org/TR/html5/tokenization.html#end-tag-open-state
		    if text[i:i+3] == '</>':
		        return i + 3
		    return self.parse_bogus_comment(i)

		# Overridable -- finish processing of start+end tag: <tag.../>
		def handle_startendtag(self, tag, attrs):
		    """Report an empty-element tag (``<tag .../>``).

		    This implementation forwards to ``handle_starttag(tag, attrs)``
		    and then to ``handle_endtag(tag)``.
		    """
		    self.handle_starttag(tag, attrs)
		    self.handle_endtag(tag)

		# Overridable -- handle start tag
		def handle_starttag(self, tag, attrs):
		    """Hook called with a start tag's name and attribute pairs.

		    This implementation does nothing.
		    """

		# Overridable -- handle end tag
		def handle_endtag(self, tag):
		    """Hook called with the name of an end tag.

		    This implementation does nothing.
		    """

		# Overridable -- handle character reference
		def handle_charref(self, name):
		    """Hook called with the body of a numeric character reference.

		    This implementation does nothing.
		    """

		# Overridable -- handle entity reference
		def handle_entityref(self, name):
		    """Hook called with the name of a named entity reference.

		    This implementation does nothing.
		    """

		# Overridable -- handle data
		def handle_data(self, data):
		    """Hook called with a run of text data.

		    This implementation does nothing.
		    """

		# Overridable -- handle comment
		def handle_comment(self, data):
		    """Hook called with the text of a comment.

		    This implementation does nothing.
		    """

		# Overridable -- handle declaration
		def handle_decl(self, decl):
		    """Hook called with the text of a declaration such as a doctype.

		    This implementation does nothing.
		    """

		# Overridable -- handle processing instruction
		def handle_pi(self, data):
		    """Hook called with the text of a processing instruction.

		    This implementation does nothing.
		    """

		def unknown_decl(self, data):
		    """Report an unrecognised declaration.

		    In strict mode this calls ``self.error``; otherwise it does
		    nothing.
		    """
		    if not self.strict:
		        return
		    self.error("unknown declaration: %r" % (data,))

		# Internal -- helper to remove special character quoting
		def unescape(self, s):
		    """Return *s* with character and entity references replaced.

		    Numeric references (``&#65;``, ``&#x41;``) become the character
		    with that code point.  Named references are looked up in
		    ``html.entities.html5``; a name without a trailing ``;`` may also
		    match by its shortest known prefix, with the remaining characters
		    kept as text.  References that cannot be resolved -- unknown
		    names, malformed numbers, or code points ``chr()`` rejects -- are
		    left in the output as literal text.

		    A string without ``&`` is returned unchanged.
		    """
		    if '&' not in s:
		        return s

		    def replace_reference(match):
		        ref = match.group(1)
		        if ref[0] == "#":
		            digits = ref[1:]
		            try:
		                if digits[0] in "xX":
		                    code = int(digits[1:].rstrip(';'), 16)
		                else:
		                    code = int(digits.rstrip(';'))
		                return chr(code)
		            except (ValueError, OverflowError):
		                # Not a valid number, or outside what chr() accepts
		                # (chr() raises OverflowError for very large ints):
		                # keep the reference as literal text.
		                return '&#' + digits

		        from html.entities import html5
		        if ref in html5:
		            return html5[ref]
		        if ref.endswith(';'):
		            # terminated but unknown name: leave untouched
		            return '&' + ref
		        # Unterminated name: use the shortest known entity that
		        # prefixes it and keep the remaining characters as text.
		        for cut in range(2, len(ref)):
		            if ref[:cut] in html5:
		                return html5[ref[:cut]] + ref[cut:]
		        return '&' + ref

		    # The two alternatives are "hex digits plus ';'" and "a word of up
		    # to 32 characters with an optional ';'".  They must be joined by
		    # a real alternation: an escaped pipe would demand a literal '|'
		    # and no ordinary reference would ever match.
		    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
		                  replace_reference, s, flags=re.ASCII)