cafe-grader-web Files · lib/assets/Lib/tokenize.py

cafe-grader-web

Location: cafe-grader-web/lib/assets/Lib/tokenize.py - annotation

Commit Description:

merge with algo and add brython files that were missing

Commit Description:

merge with algo and add brython files that were missing

References:

r584:056392c00e67 java

File last commit:

r584:056392c00e67 java

Show/Diff file:

Action:

            
                    lib/assets/Lib/tokenize.py
                
             | 682 lines
             | 24.5 KiB
             | text/x-python 
             | PythonLexer
             | 
      
            History
        
           Show Full History
         |
          Source
         | Raw
         |
              
              Download
              
     admin
  
merge with algo and add brython files that were missing

              r584
            
      """Tokenization help for Python programs.

      tokenize(readline) is a generator that breaks a stream of bytes into

      Python tokens.  It decodes the bytes according to PEP-0263 for

      determining source file encoding.

      It accepts a readline-like method which is called repeatedly to get the

      next line of input (or b"" for EOF).  It generates 5-tuples with these

      members:

          the token type (see token.py)

          the token (a string)

          the starting (row, column) indices of the token (a 2-tuple of ints)

          the ending (row, column) indices of the token (a 2-tuple of ints)

          the original line (string)

      It is designed to match the working of the Python tokenizer exactly, except

      that it produces COMMENT tokens for comments and gives type OP for all

      operators.  Additionally, all token lists start with an ENCODING token

      which tells you which encoding was used to decode the bytes stream.

      """

      __author__ = 'Ka-Ping Yee <ping@lfw.org>'

      __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '

                     'Skip Montanaro, Raymond Hettinger, Trent Nelson, '

                     'Michael Foord')

      import builtins

      import re

      import sys

      from token import *

      from codecs import lookup, BOM_UTF8

      import collections

      from io import TextIOWrapper

      # Matches a PEP 263 style encoding declaration: optional leading blanks,
      # a '#' comment that contains "coding:" or "coding=", then the encoding
      # name, which is captured as group 1.
      cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)

      # The public API is everything token.py exports plus the names added here.
      import token
      __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                                 "NL", "untokenize", "ENCODING", "TokenInfo"]
      del token  # remove the module name from this namespace again

      # Three extra token types, numbered directly after the ones that came
      # from token.py and registered in tok_name so they can be printed by name.
      COMMENT = N_TOKENS
      tok_name[COMMENT] = 'COMMENT'
      NL = N_TOKENS + 1
      tok_name[NL] = 'NL'
      ENCODING = N_TOKENS + 2
      tok_name[ENCODING] = 'ENCODING'
      N_TOKENS += 3

      # Maps the text of an operator/delimiter token to its specific token type
      # from token.py.  TokenInfo.exact_type consults this table for OP tokens.
      # '->' and '...' are included because the Operator and Special patterns
      # in this module produce them as OP tokens as well.
      EXACT_TOKEN_TYPES = {
          '(':   LPAR,
          ')':   RPAR,
          '[':   LSQB,
          ']':   RSQB,
          ':':   COLON,
          ',':   COMMA,
          ';':   SEMI,
          '+':   PLUS,
          '-':   MINUS,
          '*':   STAR,
          '/':   SLASH,
          '|':   VBAR,
          '&':   AMPER,
          '<':   LESS,
          '>':   GREATER,
          '=':   EQUAL,
          '.':   DOT,
          '%':   PERCENT,
          '{':   LBRACE,
          '}':   RBRACE,
          '==':  EQEQUAL,
          '!=':  NOTEQUAL,
          '<=':  LESSEQUAL,
          '>=':  GREATEREQUAL,
          '~':   TILDE,
          '^':   CIRCUMFLEX,
          '<<':  LEFTSHIFT,
          '>>':  RIGHTSHIFT,
          '**':  DOUBLESTAR,
          '+=':  PLUSEQUAL,
          '-=':  MINEQUAL,
          '*=':  STAREQUAL,
          '/=':  SLASHEQUAL,
          '%=':  PERCENTEQUAL,
          '&=':  AMPEREQUAL,
          '|=':  VBAREQUAL,
          '^=':  CIRCUMFLEXEQUAL,
          '<<=': LEFTSHIFTEQUAL,
          '>>=': RIGHTSHIFTEQUAL,
          '**=': DOUBLESTAREQUAL,
          '//':  DOUBLESLASH,
          '//=': DOUBLESLASHEQUAL,
          '@':   AT,
          '->':  RARROW,
          '...': ELLIPSIS,
      }

      class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
          """A token as a named 5-tuple: type, string, start, end, line."""

          def __repr__(self):
              # Show the numeric type together with its symbolic name.
              type_label = '%d (%s)' % (self.type, tok_name[self.type])
              shown = self._replace(type=type_label)
              return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
                      % shown)

          @property
          def exact_type(self):
              """The specific operator type for OP tokens, else the plain type."""
              if self.type != OP:
                  return self.type
              return EXACT_TOKEN_TYPES.get(self.string, self.type)

      def group(*choices):
          """Return a regex group that matches any one of *choices*."""
          return '(%s)' % '|'.join(choices)


      def any(*choices):
          """Return a regex matching zero or more repetitions of the choices."""
          return '%s*' % group(*choices)


      def maybe(*choices):
          """Return a regex matching zero or one occurrence of the choices."""
          return '%s?' % group(*choices)

      # Regular-expression source fragments.  They are plain strings that are
      # combined with group()/any()/maybe() and compiled later via _compile().

      # Note: we use unicode matching for names ("\w") but ascii matching for
      # number literals.
      Whitespace = r'[ \f\t]*'
      Comment = r'#[^\r\n]*'
      # Blanks, any number of backslash-newline continuations, optional comment.
      Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
      Name = r'\w+'

      # Integer literals in the four supported bases.
      Hexnumber = r'0[xX][0-9a-fA-F]+'
      Binnumber = r'0[bB][01]+'
      Octnumber = r'0[oO][0-7]+'
      Decnumber = r'(?:0+|[1-9][0-9]*)'
      Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
      # Float literals: with a decimal point (exponent optional) or with a
      # mandatory exponent and no point.
      Exponent = r'[eE][-+]?[0-9]+'
      Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
      Expfloat = r'[0-9]+' + Exponent
      Floatnumber = group(Pointfloat, Expfloat)
      # Imaginary literals: digits or a float followed by j/J.
      Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
      # Alternatives are tried in order: imaginary, then float, then integer.
      Number = group(Imagnumber, Floatnumber, Intnumber)

      # Optional string prefix: b, br, r, rb or u in any letter case.
      StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

      # Tail end of ' string.
      Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
      # Tail end of " string.
      Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
      # Tail end of ''' string.
      Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
      # Tail end of """ string.
      Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
      # Opening delimiter of a triple-quoted string, prefix included.
      Triple = group(StringPrefix + "'''", StringPrefix + '"""')
      # Single-line ' or " string.
      String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
                     StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

      # Because of leftmost-then-longest match semantics, be sure to put the
      # longest operators first (e.g., if = came before ==, == would get
      # recognized as two instances of =).
      Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                       r"//=?", r"->",
                       r"[+\-*/%&|^=<>]=?",
                       r"~")

      Bracket = '[][(){}]'
      # Line endings, the ellipsis, and single-character punctuation.
      Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
      Funny = group(Operator, Bracket, Special)

      PlainToken = group(Number, Funny, String, Name)
      Token = Ignore + PlainToken

      # First (or only) line of ' or " string.
      ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                      group("'", r'\\\r?\n'),
                      StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                      group('"', r'\\\r?\n'))
      # Backslash-newline, end of input, a comment, or a triple-quote opener.
      PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
      # The pattern _tokenize() matches repeatedly; group 1 spans the token
      # itself, after any leading whitespace.
      PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

      def _compile(expr):

          return re.compile(expr, re.UNICODE)

      # Maps an opening string delimiter (prefix included for triple quotes) to
      # the regex source that matches the remainder of that string.  The bare
      # prefix letters map to None so that _tokenize() can write
      # endpats[initial] or endpats[token[1]] or endpats[token[2]] and fall
      # through the prefix characters to the actual quote character.
      # Each key appears exactly once (an earlier revision listed R''' / R"""
      # twice, which was harmless but redundant).
      endpats = {"'": Single, '"': Double,
                 "'''": Single3, '"""': Double3,
                 "r'''": Single3, 'r"""': Double3,
                 "b'''": Single3, 'b"""': Double3,
                 "R'''": Single3, 'R"""': Double3,
                 "B'''": Single3, 'B"""': Double3,
                 "br'''": Single3, 'br"""': Double3,
                 "bR'''": Single3, 'bR"""': Double3,
                 "Br'''": Single3, 'Br"""': Double3,
                 "BR'''": Single3, 'BR"""': Double3,
                 "rb'''": Single3, 'rb"""': Double3,
                 "Rb'''": Single3, 'Rb"""': Double3,
                 "rB'''": Single3, 'rB"""': Double3,
                 "RB'''": Single3, 'RB"""': Double3,
                 "u'''": Single3, 'u"""': Double3,
                 "U'''": Single3, 'U"""': Double3,
                 'r': None, 'R': None, 'b': None, 'B': None,
                 'u': None, 'U': None}

      # Lookup tables of every accepted opening string delimiter.  Each key maps
      # to itself; the tables are used for membership tests.  They are built
      # from the prefix spellings crossed with the two quote characters, in the
      # same order the delimiters were originally listed.
      _string_prefixes = ("", "r", "R", "b", "B",
                          "br", "Br", "bR", "BR",
                          "rb", "rB", "Rb", "RB",
                          "u", "U")

      triple_quoted = {}
      for _prefix in _string_prefixes:
          for _quote in ("'''", '"""'):
              t = _prefix + _quote
              triple_quoted[t] = t

      single_quoted = {}
      for _prefix in _string_prefixes:
          for _quote in ("'", '"'):
              t = _prefix + _quote
              single_quoted[t] = t

      # Only the two tables (and the loop variable t, as before) stay defined.
      del _string_prefixes, _prefix, _quote

      # Width of a tab stop, used by _tokenize() when measuring indentation.
      tabsize = 8


      class TokenError(Exception):
          """Raised by _tokenize() when input ends inside a multi-line construct."""


      class StopTokenizing(Exception):
          """Exception type exported by this module; not raised by the code here."""

      class Untokenizer:
          """Rebuild source text from a sequence of tokens.

          Full 5-tuples are placed using their start/end positions.  As soon
          as a 2-tuple (type, string) is met, the remainder of the stream is
          handled by compat(), which spaces tokens heuristically.
          """

          def __init__(self):
              self.tokens = []        # pieces of output text, joined at the end
              self.prev_row = 1       # row where the next token is expected
              self.prev_col = 0       # column where the previous token ended
              self.encoding = None    # value of the ENCODING token, if seen

          def add_whitespace(self, start):
              """Emit the filler needed to reach position *start*.

              A token that starts on a later row than expected follows a
              backslash continuation (newlines inside brackets arrive as NL
              tokens), so one backslash-newline is written per skipped row.
              """
              row, col = start
              row_offset = row - self.prev_row
              if row_offset > 0:
                  self.tokens.append("\\\n" * row_offset)
                  self.prev_col = 0
              col_offset = col - self.prev_col
              if col_offset > 0:
                  self.tokens.append(" " * col_offset)

          def untokenize(self, iterable):
              """Return the source text for the tokens in *iterable*."""
              # A single iterator is shared with compat() so that it continues
              # where this loop stopped; passing the original iterable would
              # restart a list from its beginning.
              it = iter(iterable)
              for t in it:
                  if len(t) == 2:
                      self.compat(t, it)
                      break
                  tok_type, token, start, end, line = t
                  if tok_type == ENCODING:
                      self.encoding = token
                      continue
                  if tok_type == ENDMARKER:
                      # Carries no text; its row may lie past the last line
                      # when the input had no final newline.
                      break
                  if tok_type == DEDENT and start[0] > self.prev_row:
                      # NOTE(review): with tokens from _tokenize() this only
                      # occurs for the end-of-input dedents after a last line
                      # without newline; they carry no text either.
                      continue
                  self.add_whitespace(start)
                  self.tokens.append(token)
                  self.prev_row, self.prev_col = end
                  if tok_type in (NEWLINE, NL):
                      self.prev_row += 1
                      self.prev_col = 0
              return "".join(self.tokens)

          def compat(self, token, iterable):
              """Append text for position-less tokens.

              *token* is the first 2-tuple that was met and *iterable* yields
              the tokens after it.  Names and numbers get a trailing space,
              adjacent strings are separated by a space, and the current
              indentation string is re-emitted at the start of each line.
              """
              # Imported here, like the itertools import in tokenize(), so that
              # importing this module does not require itertools.
              from itertools import chain

              startline = False
              indents = []
              toks_append = self.tokens.append
              prevstring = False

              # The first token goes through the same loop as the rest so that
              # it is written too.
              for tok in chain([token], iterable):
                  toknum, tokval = tok[:2]
                  if toknum == ENCODING:
                      self.encoding = tokval
                      continue

                  if toknum in (NAME, NUMBER):
                      tokval += ' '

                  # Insert a space between two consecutive strings
                  if toknum == STRING:
                      if prevstring:
                          tokval = ' ' + tokval
                      prevstring = True
                  else:
                      prevstring = False

                  if toknum == INDENT:
                      indents.append(tokval)
                      continue
                  elif toknum == DEDENT:
                      indents.pop()
                      continue
                  elif toknum in (NEWLINE, NL):
                      startline = True
                  elif startline and indents:
                      toks_append(indents[-1])
                      startline = False
                  toks_append(tokval)

      def untokenize(iterable):
          """Convert a sequence of tokens back into Python source code.

          Every item of *iterable* must be a sequence holding at least the
          token number and the token string.  When an ENCODING token is
          present (tokenize() emits one first), the result is a bytes object
          encoded with that encoding; otherwise the text is returned as is.

          With full 5-tuples the token positions are used to lay out the
          text.  With only (number, string) pairs the spacing of the result
          is poor, but it tokenizes back to the same pairs:

              t1 = [tok[:2] for tok in tokenize(f.readline)]
              newcode = untokenize(t1)
              readline = BytesIO(newcode).readline
              t2 = [tok[:2] for tok in tokenize(readline)]
              assert t1 == t2
          """
          worker = Untokenizer()
          source = worker.untokenize(iterable)
          if worker.encoding is None:
              return source
          return source.encode(worker.encoding)

      def _get_normal_name(orig_enc):

          """Imitates get_normal_name in tokenizer.c."""

          # Only care about the first 12 characters.

          enc = orig_enc[:12].lower().replace("_", "-")

          if enc == "utf-8" or enc.startswith("utf-8-"):

              return "utf-8"

          if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

             enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

              return "iso-8859-1"

          return orig_enc

      def detect_encoding(readline):
          """
          The detect_encoding() function is used to detect the encoding that should
          be used to decode a Python source file.  It requires one argument, readline,
          in the same way as the tokenize() generator.

          It will call readline a maximum of twice, and return the encoding used
          (as a string) and a list of any lines (left as bytes) it has read in.

          It detects the encoding from the presence of a utf-8 bom or an encoding
          cookie as specified in pep-0263.  A cookie on the second line is only
          honoured when the first line is blank or holds nothing but a comment.
          If both a bom and a cookie are present, but disagree, a SyntaxError
          will be raised.  If the encoding cookie is an invalid charset, raise
          a SyntaxError.  Note that if a utf-8 bom is found, 'utf-8-sig' is
          returned.

          If no encoding is specified, then the default of 'utf-8' will be returned.
          """
          # When readline is a bound method of an object with a name (such as
          # a file), that name is used in error messages.
          try:
              filename = readline.__self__.name
          except AttributeError:
              filename = None
          bom_found = False
          encoding = None
          default = 'utf-8'

          def read_or_stop():
              # A readline that signals the end with StopIteration is treated
              # like one returning an empty bytes object.
              try:
                  return readline()
              except StopIteration:
                  return b''

          def find_cookie(line):
              # Return the normalized encoding declared on *line*, or None.
              try:
                  # Decode as UTF-8. Either the line is an encoding declaration,
                  # in which case it should be pure ASCII, or it must be UTF-8
                  # per default encoding.
                  line_string = line.decode('utf-8')
              except UnicodeDecodeError:
                  msg = "invalid or missing encoding declaration"
                  if filename is not None:
                      msg = '{} for {!r}'.format(msg, filename)
                  raise SyntaxError(msg)

              match = cookie_re.match(line_string)
              if not match:
                  return None
              encoding = _get_normal_name(match.group(1))
              try:
                  codec = lookup(encoding)
              except LookupError:
                  # This behaviour mimics the Python interpreter
                  if filename is None:
                      msg = "unknown encoding: " + encoding
                  else:
                      msg = "unknown encoding for {!r}: {}".format(filename,
                              encoding)
                  raise SyntaxError(msg)

              if bom_found:
                  if encoding != 'utf-8':
                      # This behaviour mimics the Python interpreter
                      if filename is None:
                          msg = 'encoding problem: utf-8'
                      else:
                          msg = 'encoding problem for {!r}: utf-8'.format(filename)
                      raise SyntaxError(msg)
                  encoding += '-sig'
              return encoding

          first = read_or_stop()
          if first.startswith(BOM_UTF8):
              bom_found = True
              first = first[3:]
              default = 'utf-8-sig'
          if not first:
              return default, []

          encoding = find_cookie(first)
          if encoding:
              return encoding, [first]
          # The second line may only declare the encoding when the first line
          # is blank or comment-only; otherwise the first line is code and the
          # default applies without reading any further.
          if not re.match(br'^[ \t\f]*(?:[#\r\n]|$)', first):
              return default, [first]

          second = read_or_stop()
          if not second:
              return default, [first]

          encoding = find_cookie(second)
          if encoding:
              return encoding, [first, second]

          return default, [first, second]

      def open(filename):
          """Open a file in read only mode using the encoding detected by
          detect_encoding().

          Returns a TextIOWrapper over the file, positioned at its start, with
          its mode attribute set to 'r'.  If detecting the encoding or setting
          up the wrapper fails, the underlying binary file is closed before
          the exception propagates.
          """
          buffer = builtins.open(filename, 'rb')
          try:
              # The lines consumed during detection are not needed: the file
              # is rewound and read again through the text wrapper.
              encoding, _ = detect_encoding(buffer.readline)
              buffer.seek(0)
              text = TextIOWrapper(buffer, encoding, line_buffering=True)
              text.mode = 'r'
              return text
          except BaseException:
              buffer.close()
              raise

      def tokenize(readline):
          """
          Tokenize the bytes lines supplied by *readline*.

          *readline* must be a callable with the interface of the readline()
          method of built-in file objects: every call returns one line of
          input as bytes.  It may also be a callable that ends the input by
          raising StopIteration:
              readline = open(myfile, 'rb').__next__  # Example of alternate readline

          The generator yields 5-tuples: the token type; the token string;
          a 2-tuple (srow, scol) of ints giving the row and column where the
          token begins; a 2-tuple (erow, ecol) of ints giving the row and
          column where it ends; and the line on which the token was found.
          The line passed is the logical line; continuation lines are
          included.

          The first token is always an ENCODING token naming the encoding
          that was used to decode the bytes stream.
          """
          # This import is here to avoid problems when the itertools module is not
          # built yet and tokenize is imported.
          from itertools import chain, repeat

          encoding, consumed = detect_encoding(readline)
          # Replay the lines already read by detect_encoding(), continue with
          # the rest of the input, then supply empty bytes indefinitely.
          remaining = iter(readline, b"")
          padding = repeat(b"")
          line_source = chain(consumed, remaining, padding)
          return _tokenize(line_source.__next__, encoding)

      def _tokenize(readline, encoding):
          """Generate TokenInfo tuples for the lines returned by *readline*.

          *readline* is called once per line; a StopIteration raised by it is
          treated as end of input.  When *encoding* is not None an ENCODING
          token is yielded first and every line is decoded with it; otherwise
          the lines are used as returned.

          At the end of input a DEDENT is yielded for each open indentation
          level, followed by ENDMARKER.

          Raises TokenError when the input ends inside a multi-line string or
          statement, and IndentationError when a dedent matches no outer
          indentation level.
          """
          # lnum: current line number; parenlev: bracket nesting depth;
          # continued: set after a backslash continuation.
          lnum = parenlev = continued = 0
          numchars = '0123456789'
          # contstr: text collected so far for a string spanning several lines;
          # needcont: set when that string needs a backslash on each line.
          contstr, needcont = '', 0
          contline = None
          indents = [0]                # stack of indentation columns
          if encoding is not None:
              if encoding == "utf-8-sig":
                  # BOM will already have been stripped.
                  encoding = "utf-8"
              yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
          while True:             # loop over lines in stream
              try:
                  line = readline()
              except StopIteration:
                  line = b''

              if encoding is not None:
                  line = line.decode(encoding)
              lnum += 1
              pos, max = 0, len(line)

              if contstr:                            # continued string
                  if not line:
                      raise TokenError("EOF in multi-line string", strstart)
                  endmatch = endprog.match(line)
                  if endmatch:
                      # The string ends on this line; the rest of the line is
                      # scanned by the token loop below.
                      pos = end = endmatch.end(0)
                      yield TokenInfo(STRING, contstr + line[:end],
                             strstart, (lnum, end), contline + line)
                      contstr, needcont = '', 0
                      contline = None
                  elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                      # A single-quoted string was continued with a backslash
                      # but this line neither closes nor continues it.
                      yield TokenInfo(ERRORTOKEN, contstr + line,
                                 strstart, (lnum, len(line)), contline)
                      contstr = ''
                      contline = None
                      continue
                  else:
                      contstr = contstr + line
                      contline = contline + line
                      continue

              elif parenlev == 0 and not continued:  # new statement
                  if not line: break
                  column = 0
                  while pos < max:                   # measure leading whitespace
                      if line[pos] == ' ':
                          column += 1
                      elif line[pos] == '\t':
                          column = (column//tabsize + 1)*tabsize
                      elif line[pos] == '\f':
                          column = 0
                      else:
                          break
                      pos += 1
                  # Only whitespace and no line ending: leave the line loop.
                  if pos == max:
                      break

                  if line[pos] in '#\r\n':           # skip comments or blank lines
                      if line[pos] == '#':
                          comment_token = line[pos:].rstrip('\r\n')
                          nl_pos = pos + len(comment_token)
                          yield TokenInfo(COMMENT, comment_token,
                                 (lnum, pos), (lnum, pos + len(comment_token)), line)
                          yield TokenInfo(NL, line[nl_pos:],
                                 (lnum, nl_pos), (lnum, len(line)), line)
                      else:
                          # line[pos] is '\r' or '\n' here, so the index below
                          # always selects NL.
                          yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                 (lnum, pos), (lnum, len(line)), line)
                      continue

                  if column > indents[-1]:           # count indents or dedents
                      indents.append(column)
                      yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
                  while column < indents[-1]:
                      if column not in indents:
                          raise IndentationError(
                              "unindent does not match any outer indentation level",
                              ("<tokenize>", lnum, pos, line))
                      indents = indents[:-1]
                      yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

              else:                                  # continued statement
                  if not line:
                      raise TokenError("EOF in multi-line statement", (lnum, 0))
                  continued = 0

              while pos < max:
                  pseudomatch = _compile(PseudoToken).match(line, pos)
                  if pseudomatch:                                # scan for tokens
                      start, end = pseudomatch.span(1)
                      spos, epos, pos = (lnum, start), (lnum, end), end
                      # An empty match yields no token.
                      if start == end:
                          continue
                      token, initial = line[start:end], line[start]

                      if (initial in numchars or                  # ordinary number
                          (initial == '.' and token != '.' and token != '...')):
                          yield TokenInfo(NUMBER, token, spos, epos, line)
                      elif initial in '\r\n':
                          # Inside brackets a line ending does not end the
                          # statement, so it is reported as NL.
                          yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                 token, spos, epos, line)
                      elif initial == '#':
                          assert not token.endswith("\n")
                          yield TokenInfo(COMMENT, token, spos, epos, line)
                      elif token in triple_quoted:
                          endprog = _compile(endpats[token])
                          endmatch = endprog.match(line, pos)
                          if endmatch:                           # all on one line
                              pos = endmatch.end(0)
                              token = line[start:pos]
                              yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                          else:
                              strstart = (lnum, start)           # multiple lines
                              contstr = line[start:]
                              contline = line
                              break
                      elif initial in single_quoted or \
                          token[:2] in single_quoted or \
                          token[:3] in single_quoted:
                          if token[-1] == '\n':                  # continued string
                              strstart = (lnum, start)
                              # Prefix letters map to None in endpats, so the
                              # first non-None entry belongs to the quote.
                              endprog = _compile(endpats[initial] or
                                                 endpats[token[1]] or
                                                 endpats[token[2]])
                              contstr, needcont = line[start:], 1
                              contline = line
                              break
                          else:                                  # ordinary string
                              yield TokenInfo(STRING, token, spos, epos, line)
                      elif initial.isidentifier():               # ordinary name
                          yield TokenInfo(NAME, token, spos, epos, line)
                      elif initial == '\\':                      # continued stmt
                          continued = 1
                      else:
                          if initial in '([{':
                              parenlev += 1
                          elif initial in ')]}':
                              parenlev -= 1
                          yield TokenInfo(OP, token, spos, epos, line)
                  else:
                      # Nothing matched here: report one character as an
                      # error token and carry on after it.
                      yield TokenInfo(ERRORTOKEN, line[pos],
                                 (lnum, pos), (lnum, pos+1), line)
                      pos += 1

          for indent in indents[1:]:                 # pop remaining indent levels
              yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
          yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

      # An undocumented, backwards compatible, API for all the places in the standard
      # library that expect to be able to use tokenize with strings
      def generate_tokens(readline):
          """Tokenize lines from *readline* without decoding them.

          No ENCODING token is produced because no encoding is passed on.
          """
          return _tokenize(readline, encoding=None)

      def main():
          """Command-line interface: print the tokens of a file or of stdin.

          Tokenizing errors are reported on stderr and end the process with
          exit status 1.
          """
          import argparse

          # Helper error handling routines
          def perror(message):
              print(message, file=sys.stderr)

          def error(message, filename=None, location=None):
              # Build the most specific message available, print it and exit.
              if location:
                  text = "%s:%d:%d: error: %s" % ((filename,) + location + (message,))
              elif filename:
                  text = "%s: error: %s" % (filename, message)
              else:
                  text = "error: %s" % message
              perror(text)
              sys.exit(1)

          # Parse the arguments and options
          cli = argparse.ArgumentParser(prog='python -m tokenize')
          cli.add_argument(dest='filename', nargs='?',
                           metavar='filename.py',
                           help='the file to tokenize; defaults to stdin')
          cli.add_argument('-e', '--exact', dest='exact', action='store_true',
                           help='display token names using the exact type')
          options = cli.parse_args()

          try:
              # Tokenize the input
              if options.filename:
                  filename = options.filename
                  with builtins.open(filename, 'rb') as source:
                      tokens = list(tokenize(source.readline))
              else:
                  filename = "<stdin>"
                  tokens = _tokenize(sys.stdin.readline, None)

              # Output the tokenization
              for tok in tokens:
                  kind = tok.exact_type if options.exact else tok.type
                  span = "%d,%d-%d,%d:" % (tok.start + tok.end)
                  print("%-20s%-15s%-15r" % (span, tok_name[kind], tok.string))
          # IndentationError is handled before SyntaxError, its base class.
          except IndentationError as err:
              line, column = err.args[1][1:3]
              error(err.args[0], filename, (line, column))
          except TokenError as err:
              line, column = err.args[1]
              error(err.args[0], filename, (line, column))
          except SyntaxError as err:
              error(err, filename)
          except IOError as err:
              error(err)
          except KeyboardInterrupt:
              print("interrupted\n")
          except Exception as err:
              perror("unexpected error: %s" % err)
              raise

      # Run the command-line interface when this file is executed as a script.
      if __name__ == "__main__":
          main()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository permissions settings

Sign in to your account

Last Author

admin merge with algo and add brython files that were missing	r584	"""Tokenization help for Python programs.

		tokenize(readline) is a generator that breaks a stream of bytes into
		Python tokens. It decodes the bytes according to PEP-0263 for
		determining source file encoding.

		It accepts a readline-like method which is called repeatedly to get the
		next line of input (or b"" for EOF). It generates 5-tuples with these
		members:

		the token type (see token.py)
		the token (a string)
		the starting (row, column) indices of the token (a 2-tuple of ints)
		the ending (row, column) indices of the token (a 2-tuple of ints)
		the original line (string)

		It is designed to match the working of the Python tokenizer exactly, except
		that it produces COMMENT tokens for comments and gives type OP for all
		operators. Additionally, all token lists start with an ENCODING token
		which tells you which encoding was used to decode the bytes stream.
		"""

		__author__ = 'Ka-Ping Yee <ping@lfw.org>'
		__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
		'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
		'Michael Foord')
		import builtins
		import re
		import sys
		from token import *
		from codecs import lookup, BOM_UTF8
		import collections
		from io import TextIOWrapper
		cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)

		import token
		__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
		"NL", "untokenize", "ENCODING", "TokenInfo"]
		del token

		COMMENT = N_TOKENS
		tok_name[COMMENT] = 'COMMENT'
		NL = N_TOKENS + 1
		tok_name[NL] = 'NL'
		ENCODING = N_TOKENS + 2
		tok_name[ENCODING] = 'ENCODING'
		N_TOKENS += 3
		EXACT_TOKEN_TYPES = {
		'(': LPAR,
		')': RPAR,
		'[': LSQB,
		']': RSQB,
		':': COLON,
		',': COMMA,
		';': SEMI,
		'+': PLUS,
		'-': MINUS,
		'*': STAR,
		'/': SLASH,
		'|': VBAR,
		'&': AMPER,
		'<': LESS,
		'>': GREATER,
		'=': EQUAL,
		'.': DOT,
		'%': PERCENT,
		'{': LBRACE,
		'}': RBRACE,
		'==': EQEQUAL,
		'!=': NOTEQUAL,
		'<=': LESSEQUAL,
		'>=': GREATEREQUAL,
		'~': TILDE,
		'^': CIRCUMFLEX,
		'<<': LEFTSHIFT,
		'>>': RIGHTSHIFT,
		'**': DOUBLESTAR,
		'+=': PLUSEQUAL,
		'-=': MINEQUAL,
		'*=': STAREQUAL,
		'/=': SLASHEQUAL,
		'%=': PERCENTEQUAL,
		'&=': AMPEREQUAL,
		'\|=': VBAREQUAL,
		'^=': CIRCUMFLEXEQUAL,
		'<<=': LEFTSHIFTEQUAL,
		'>>=': RIGHTSHIFTEQUAL,
		'**=': DOUBLESTAREQUAL,
		'//': DOUBLESLASH,
		'//=': DOUBLESLASHEQUAL,
		'@': AT
		}

		class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
		    """A single token: (type, string, start, end, line) with a readable repr."""

		    def __repr__(self):
		        # Show the numeric type together with its symbolic name.
		        labelled = self._replace(
		            type='%d (%s)' % (self.type, tok_name[self.type]))
		        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
		                % labelled)

		    @property
		    def exact_type(self):
		        """Return the operator-specific type for OP tokens, else ``type``."""
		        if self.type != OP:
		            return self.type
		        return EXACT_TOKEN_TYPES.get(self.string, self.type)

		def group(*choices):
		    """Return a regex group that matches any one of the given alternatives."""
		    # The separator must be a bare '|'; an escaped pipe would match a
		    # literal '|' character instead of acting as alternation.
		    return '(' + '|'.join(choices) + ')'

		def any(*choices):
		    """Return a regex matching zero or more repetitions of the alternatives."""
		    return group(*choices) + '*'

		def maybe(*choices):
		    """Return a regex matching zero or one occurrence of the alternatives."""
		    return group(*choices) + '?'

		# Note: we use unicode matching for names ("\w") but ascii matching for
		# number literals.
		Whitespace = r'[ \f\t]*'
		Comment = r'#[^\r\n]*'
		# Whitespace, any number of backslash-newline continuations, then an
		# optional comment.
		Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
		Name = r'\w+'

		Hexnumber = r'0[xX][0-9a-fA-F]+'
		Binnumber = r'0[bB][01]+'
		Octnumber = r'0[oO][0-7]+'
		# Either a run of zeros or a non-zero-led decimal; the '|' is a real
		# alternation and must not be escaped.
		Decnumber = r'(?:0+|[1-9][0-9]*)'
		Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
		Exponent = r'[eE][-+]?[0-9]+'
		Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
		Expfloat = r'[0-9]+' + Exponent
		Floatnumber = group(Pointfloat, Expfloat)
		Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
		Number = group(Imagnumber, Floatnumber, Intnumber)

		# Optional string prefix: b/br, r/rb (any case) or u.
		StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

		# Each pattern below matches the remainder of a string literal up to and
		# including its closing quote(s): runs of ordinary characters interleaved
		# with backslash escapes (and, for triple-quoted strings, lone quotes).
		# Tail end of ' string.
		Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
		# Tail end of " string.
		Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
		# Tail end of ''' string.
		Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
		# Tail end of """ string.
		Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
		# Opening of a triple-quoted string (with optional prefix).
		Triple = group(StringPrefix + "'''", StringPrefix + '"""')
		# Single-line ' or " string.
		String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
		               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

		# Because of leftmost-then-longest match semantics, be sure to put the
		# longest operators first (e.g., if = came before ==, == would get
		# recognized as two instances of =).
		Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
		                 r"//=?", r"->",
		                 r"[+\-*/%&|^=<>]=?",
		                 r"~")

		Bracket = '[][(){}]'
		Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
		Funny = group(Operator, Bracket, Special)

		PlainToken = group(Number, Funny, String, Name)
		Token = Ignore + PlainToken

		# First (or only) line of ' or " string.
		ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
		                group("'", r'\\\r?\n'),
		                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
		                group('"', r'\\\r?\n'))
		# Backslash-newline continuation or end of input, a comment, or the
		# opening of a triple-quoted string.
		PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
		PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

		def _compile(expr):
		    """Compile *expr* into a regular expression object with re.UNICODE set."""
		    pattern = re.compile(expr, flags=re.UNICODE)
		    return pattern

		# Maps a string opener to the pattern that matches the rest of that string.
		endpats = {"'": Single, '"': Double}
		# Every triple-quote opener, bare or prefixed, uses the matching
		# triple-quote tail pattern.
		endpats.update(
		    (prefix + quote, tail)
		    for prefix in ('', 'r', 'b', 'R', 'B',
		                   'br', 'bR', 'Br', 'BR',
		                   'rb', 'Rb', 'rB', 'RB',
		                   'u', 'U')
		    for quote, tail in (("'''", Single3), ('"""', Double3))
		)
		# A lone prefix letter has no tail pattern of its own; the None entries let
		# _tokenize fall through to the following character(s) with "or".
		endpats.update(dict.fromkeys('rRbBuU'))

		# Every recognised triple-quote string opener, mapped to itself.  _tokenize
		# only tests membership ("token in triple_quoted").
		triple_quoted = {}
		for t in ("'''", '"""',
		          "r'''", 'r"""', "R'''", 'R"""',
		          "b'''", 'b"""', "B'''", 'B"""',
		          "br'''", 'br"""', "Br'''", 'Br"""',
		          "bR'''", 'bR"""', "BR'''", 'BR"""',
		          "rb'''", 'rb"""', "rB'''", 'rB"""',
		          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
		          "u'''", 'u"""', "U'''", 'U"""',
		          ):
		    triple_quoted[t] = t
		# Every recognised single-quote string opener, mapped to itself.  _tokenize
		# checks the first one, two or three characters of a token against it.
		single_quoted = {}
		for t in ("'", '"',
		          "r'", 'r"', "R'", 'R"',
		          "b'", 'b"', "B'", 'B"',
		          "br'", 'br"', "Br'", 'Br"',
		          "bR'", 'bR"', "BR'", 'BR"' ,
		          "rb'", 'rb"', "rB'", 'rB"',
		          "Rb'", 'Rb"', "RB'", 'RB"' ,
		          "u'", 'u"', "U'", 'U"',
		          ):
		    single_quoted[t] = t

		# Tab stops used by _tokenize when measuring the indentation column.
		tabsize = 8

		class TokenError(Exception):
		    """Raised by _tokenize when input ends inside a multi-line string or statement."""

		class StopTokenizing(Exception):
		    """Plain Exception subclass; not raised anywhere in the visible code."""


		class Untokenizer:
		    """Rebuild source text from a stream of tokens.

		    Full 5-tuples are placed at their recorded positions.  As soon as a
		    2-tuple (type, string) is seen, the rest of the stream is handled in
		    "compat" mode, which spaces tokens out heuristically.
		    """

		    def __init__(self):
		        self.tokens = []       # pieces of output text, joined at the end
		        self.prev_row = 1      # row where the previous token ended
		        self.prev_col = 0      # column where the previous token ended
		        self.encoding = None   # value of the ENCODING token, if one was seen

		    def add_whitespace(self, start):
		        """Emit the filler needed to move from the previous token end to *start*.

		        Raises ValueError if *start* lies before the end of the previous
		        token.  A token starting on a later row (for example after a
		        backslash continuation) is reached by emitting one backslash-newline
		        per skipped row.
		        """
		        row, col = start
		        if row < self.prev_row or (row == self.prev_row and col < self.prev_col):
		            raise ValueError("start ({},{}) precedes previous end ({},{})"
		                             .format(row, col, self.prev_row, self.prev_col))
		        row_offset = row - self.prev_row
		        if row_offset:
		            self.tokens.append("\\\n" * row_offset)
		            self.prev_col = 0
		        col_offset = col - self.prev_col
		        if col_offset:
		            self.tokens.append(" " * col_offset)

		    def untokenize(self, iterable):
		        """Return the source text described by *iterable* as a str."""
		        # Use one shared iterator so that compat() continues exactly where
		        # this loop stopped, whether the input is a list or a generator.
		        it = iter(iterable)
		        for t in it:
		            if len(t) == 2:
		                self.compat(t, it)
		                break
		            tok_type, token, start, end, line = t
		            if tok_type == ENCODING:
		                self.encoding = token
		                continue
		            if tok_type == ENDMARKER:
		                # Carries no text; stopping here avoids emitting a spurious
		                # continuation when the last line has no trailing newline.
		                break
		            self.add_whitespace(start)
		            self.tokens.append(token)
		            self.prev_row, self.prev_col = end
		            if tok_type in (NEWLINE, NL):
		                self.prev_row += 1
		                self.prev_col = 0
		        return "".join(self.tokens)

		    def compat(self, token, iterable):
		        """Append text for *token* and the remaining 2-tuple style tokens.

		        *token* is the first token to emit; *iterable* yields the tokens that
		        follow it.  Only the first two items of each token are used.
		        """
		        # Local import: the module avoids importing itertools at top level.
		        from itertools import chain

		        indents = []
		        toks_append = self.tokens.append
		        startline = token[0] in (NEWLINE, NL)
		        prevstring = False

		        # Process the triggering token through the same path as the rest so
		        # that it is neither dropped nor handled twice.
		        for tok in chain([token], iterable):
		            toknum, tokval = tok[:2]
		            if toknum == ENCODING:
		                self.encoding = tokval
		                continue

		            if toknum in (NAME, NUMBER):
		                tokval += ' '

		            # Insert a space between two consecutive strings
		            if toknum == STRING:
		                if prevstring:
		                    tokval = ' ' + tokval
		                prevstring = True
		            else:
		                prevstring = False

		            if toknum == INDENT:
		                indents.append(tokval)
		                continue
		            elif toknum == DEDENT:
		                indents.pop()
		                continue
		            elif toknum in (NEWLINE, NL):
		                startline = True
		            elif startline and indents:
		                # First token on a new line: re-emit the current indentation.
		                toks_append(indents[-1])
		                startline = False
		            toks_append(tokval)


		def untokenize(iterable):
		    """Convert a sequence of tokens back into Python source.

		    Every item of *iterable* must be a token sequence whose first two
		    elements are the token number and the token string.  When an ENCODING
		    token is present the result is a bytes object encoded with that
		    encoding; otherwise the text is returned as a str.

		    With full (5-element) tokens the recorded positions are used to lay
		    the text out.  With only (number, string) pairs the spacing of the
		    output is approximate, but it tokenizes back to the same pairs:

		        t1 = [tok[:2] for tok in tokenize(f.readline)]
		        newcode = untokenize(t1)
		        readline = BytesIO(newcode).readline
		        t2 = [tok[:2] for tok in tokenize(readline)]
		        assert t1 == t2
		    """
		    rebuilder = Untokenizer()
		    text = rebuilder.untokenize(iterable)
		    if rebuilder.encoding is None:
		        return text
		    return text.encode(rebuilder.encoding)


		def _get_normal_name(orig_enc):
		    """Imitates get_normal_name in tokenizer.c.

		    Returns "utf-8" or "iso-8859-1" for the recognised spellings of those
		    encodings, and *orig_enc* unchanged for anything else.
		    """
		    # Only the first 12 characters take part in the comparison.
		    head = orig_enc[:12].lower().replace("_", "-")
		    if head == "utf-8" or head.startswith("utf-8-"):
		        return "utf-8"
		    latin_names = ("latin-1", "iso-8859-1", "iso-latin-1")
		    latin_prefixes = tuple(name + "-" for name in latin_names)
		    if head in latin_names or head.startswith(latin_prefixes):
		        return "iso-8859-1"
		    return orig_enc

		def detect_encoding(readline):
		    """
		    The detect_encoding() function is used to detect the encoding that should
		    be used to decode a Python source file.  It requires one argument,
		    readline, in the same way as the tokenize() generator.

		    It will call readline a maximum of twice, and return the encoding used
		    (as a string) and a list of any lines (left as bytes) it has read in.

		    It detects the encoding from the presence of a utf-8 bom or an encoding
		    cookie as specified in pep-0263.  The second line is only consulted when
		    the first line is blank or consists solely of a comment.  If both a bom
		    and a cookie are present, but disagree, a SyntaxError will be raised.  If
		    the encoding cookie is an invalid charset, raise a SyntaxError.  Note that
		    if a utf-8 bom is found, 'utf-8-sig' is returned.

		    If no encoding is specified, then the default of 'utf-8' will be returned.
		    """
		    try:
		        # Bound methods of file objects expose the file name; used only to
		        # make error messages more helpful.
		        filename = readline.__self__.name
		    except AttributeError:
		        filename = None
		    bom_found = False
		    encoding = None
		    default = 'utf-8'

		    def read_or_stop():
		        # An exhausted iterator-style readline counts as end of file.
		        try:
		            return readline()
		        except StopIteration:
		            return b''

		    def find_cookie(line):
		        # Return the normalised encoding declared on *line*, or None.
		        try:
		            # Decode as UTF-8. Either the line is an encoding declaration,
		            # in which case it should be pure ASCII, or it must be UTF-8
		            # per default encoding.
		            line_string = line.decode('utf-8')
		        except UnicodeDecodeError:
		            msg = "invalid or missing encoding declaration"
		            if filename is not None:
		                msg = '{} for {!r}'.format(msg, filename)
		            raise SyntaxError(msg)

		        match = cookie_re.match(line_string)
		        if not match:
		            return None
		        encoding = _get_normal_name(match.group(1))
		        try:
		            # Called only to validate the name; the codec itself is unused.
		            lookup(encoding)
		        except LookupError:
		            # This behaviour mimics the Python interpreter
		            if filename is None:
		                msg = "unknown encoding: " + encoding
		            else:
		                msg = "unknown encoding for {!r}: {}".format(filename,
		                                                             encoding)
		            raise SyntaxError(msg)

		        if bom_found:
		            if encoding != 'utf-8':
		                # This behaviour mimics the Python interpreter
		                if filename is None:
		                    msg = 'encoding problem: utf-8'
		                else:
		                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
		                raise SyntaxError(msg)
		            encoding += '-sig'
		        return encoding

		    first = read_or_stop()
		    if first.startswith(BOM_UTF8):
		        bom_found = True
		        first = first[3:]
		        default = 'utf-8-sig'
		    if not first:
		        return default, []

		    encoding = find_cookie(first)
		    if encoding:
		        return encoding, [first]
		    # PEP 263: a declaration on line 2 only counts when line 1 is blank or a
		    # pure comment; otherwise a "coding" comment after real code on line 1
		    # would be misread as a cookie.
		    if not re.match(br'^[ \t\f]*(?:[#\r\n]|$)', first):
		        return default, [first]

		    second = read_or_stop()
		    if not second:
		        return default, [first]

		    encoding = find_cookie(second)
		    if encoding:
		        return encoding, [first, second]

		    return default, [first, second]


		def open(filename):
		    """Open a file in read only mode using the encoding detected by
		    detect_encoding().

		    Returns a TextIOWrapper positioned at the start of the file.  If
		    detecting the encoding or building the wrapper fails, the underlying
		    binary file is closed before the exception propagates.
		    """
		    buffer = builtins.open(filename, 'rb')
		    try:
		        encoding, lines = detect_encoding(buffer.readline)
		        # Rewind: detect_encoding consumed up to two lines.
		        buffer.seek(0)
		        text = TextIOWrapper(buffer, encoding, line_buffering=True)
		        text.mode = 'r'
		        return text
		    except BaseException:
		        # Do not leak the file handle on any failure; re-raise unchanged.
		        buffer.close()
		        raise


		def tokenize(readline):
		    """
		    Tokenize a stream of bytes lines.

		    *readline* must be a callable with the same interface as the readline()
		    method of built-in file objects: each call returns one line of input as
		    bytes.  Alternatively it may be a callable that signals the end of input
		    by raising StopIteration:
		        readline = open(myfile, 'rb').__next__  # Example of alternate readline

		    The generator produces 5-tuples with these members: the token type; the
		    token string; a 2-tuple (srow, scol) of ints specifying the row and
		    column where the token begins in the source; a 2-tuple (erow, ecol) of
		    ints specifying the row and column where the token ends in the source;
		    and the line on which the token was found.  The line passed is the
		    logical line; continuation lines are included.

		    The first token sequence will always be an ENCODING token
		    which tells you which encoding was used to decode the bytes stream.
		    """
		    # Imported here rather than at module level so that tokenize can still
		    # be imported when the itertools module has not been built yet.
		    from itertools import chain, repeat
		    encoding, consumed = detect_encoding(readline)
		    # Replay the lines detect_encoding already read, then the remaining
		    # input up to the first b"", then b"" forever.
		    remaining = iter(readline, b"")
		    line_source = chain(consumed, remaining, repeat(b""))
		    return _tokenize(line_source.__next__, encoding)


		def _tokenize(readline, encoding):
		    """Generate TokenInfo tuples for the lines returned by *readline*.

		    When *encoding* is not None an ENCODING token is yielded first and every
		    line is decoded with it; when it is None the lines are used as returned.
		    An empty line from *readline* (or StopIteration) marks end of input.

		    Raises TokenError if the input ends inside a multi-line string or a
		    continued statement, and IndentationError on a dedent that matches no
		    enclosing indentation level.
		    """
		    # lnum: current line number; parenlev: bracket nesting depth;
		    # continued: set when the previous line ended with a backslash.
		    lnum = parenlev = continued = 0
		    numchars = '0123456789'
		    # contstr: text of a string literal still open from earlier lines;
		    # needcont: set for single-quoted strings, whose lines must end in '\'.
		    contstr, needcont = '', 0
		    contline = None
		    # Stack of active indentation columns.
		    indents = [0]

		    if encoding is not None:
		        if encoding == "utf-8-sig":
		            # BOM will already have been stripped.
		            encoding = "utf-8"
		        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
		    while True:             # loop over lines in stream
		        try:
		            line = readline()
		        except StopIteration:
		            line = b''

		        if encoding is not None:
		            line = line.decode(encoding)
		        lnum += 1
		        pos, max = 0, len(line)

		        if contstr:                            # continued string
		            if not line:
		                raise TokenError("EOF in multi-line string", strstart)
		            endmatch = endprog.match(line)
		            if endmatch:
		                # The string closes on this line; scanning resumes after it.
		                pos = end = endmatch.end(0)
		                yield TokenInfo(STRING, contstr + line[:end],
		                       strstart, (lnum, end), contline + line)
		                contstr, needcont = '', 0
		                contline = None
		            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
		                # Single-quoted string neither closed nor continued.
		                yield TokenInfo(ERRORTOKEN, contstr + line,
		                           strstart, (lnum, len(line)), contline)
		                contstr = ''
		                contline = None
		                continue
		            else:
		                contstr = contstr + line
		                contline = contline + line
		                continue

		        elif parenlev == 0 and not continued:  # new statement
		            if not line: break
		            column = 0
		            while pos < max:                   # measure leading whitespace
		                if line[pos] == ' ':
		                    column += 1
		                elif line[pos] == '\t':
		                    # Advance to the next tab stop.
		                    column = (column//tabsize + 1)*tabsize
		                elif line[pos] == '\f':
		                    # A form feed resets the column count.
		                    column = 0
		                else:
		                    break
		                pos += 1
		            if pos == max:
		                # The line was whitespace only with no line ending.
		                break

		            if line[pos] in '#\r\n':           # skip comments or blank lines
		                if line[pos] == '#':
		                    comment_token = line[pos:].rstrip('\r\n')
		                    nl_pos = pos + len(comment_token)
		                    yield TokenInfo(COMMENT, comment_token,
		                           (lnum, pos), (lnum, pos + len(comment_token)), line)
		                    yield TokenInfo(NL, line[nl_pos:],
		                           (lnum, nl_pos), (lnum, len(line)), line)
		                else:
		                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
		                           (lnum, pos), (lnum, len(line)), line)
		                continue

		            if column > indents[-1]:           # count indents or dedents
		                indents.append(column)
		                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
		            while column < indents[-1]:
		                if column not in indents:
		                    raise IndentationError(
		                        "unindent does not match any outer indentation level",
		                        ("<tokenize>", lnum, pos, line))
		                indents = indents[:-1]
		                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

		        else:                                  # continued statement
		            if not line:
		                raise TokenError("EOF in multi-line statement", (lnum, 0))
		            continued = 0

		        while pos < max:
		            pseudomatch = _compile(PseudoToken).match(line, pos)
		            if pseudomatch:                                # scan for tokens
		                start, end = pseudomatch.span(1)
		                spos, epos, pos = (lnum, start), (lnum, end), end
		                if start == end:
		                    # Empty match; nothing to emit at this position.
		                    continue
		                token, initial = line[start:end], line[start]

		                if (initial in numchars or                  # ordinary number
		                    (initial == '.' and token != '.' and token != '...')):
		                    yield TokenInfo(NUMBER, token, spos, epos, line)
		                elif initial in '\r\n':
		                    # Inside brackets a line break does not end the statement.
		                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
		                           token, spos, epos, line)
		                elif initial == '#':
		                    assert not token.endswith("\n")
		                    yield TokenInfo(COMMENT, token, spos, epos, line)
		                elif token in triple_quoted:
		                    endprog = _compile(endpats[token])
		                    endmatch = endprog.match(line, pos)
		                    if endmatch:                           # all on one line
		                        pos = endmatch.end(0)
		                        token = line[start:pos]
		                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
		                    else:
		                        strstart = (lnum, start)           # multiple lines
		                        contstr = line[start:]
		                        contline = line
		                        break
		                elif initial in single_quoted or \
		                    token[:2] in single_quoted or \
		                    token[:3] in single_quoted:
		                    if token[-1] == '\n':                  # continued string
		                        strstart = (lnum, start)
		                        # Prefix letters map to None in endpats, so "or"
		                        # falls through to the quote character's pattern.
		                        endprog = _compile(endpats[initial] or
		                                           endpats[token[1]] or
		                                           endpats[token[2]])
		                        contstr, needcont = line[start:], 1
		                        contline = line
		                        break
		                    else:                                  # ordinary string
		                        yield TokenInfo(STRING, token, spos, epos, line)
		                elif initial.isidentifier():               # ordinary name
		                    yield TokenInfo(NAME, token, spos, epos, line)
		                elif initial == '\\':                      # continued stmt
		                    continued = 1
		                else:
		                    if initial in '([{':
		                        parenlev += 1
		                    elif initial in ')]}':
		                        parenlev -= 1
		                    yield TokenInfo(OP, token, spos, epos, line)
		            else:
		                # No pattern matched: emit the character as an error token.
		                yield TokenInfo(ERRORTOKEN, line[pos],
		                           (lnum, pos), (lnum, pos+1), line)
		                pos += 1

		    for indent in indents[1:]:                 # pop remaining indent levels
		        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
		    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


		# Undocumented, backwards-compatible entry point for the places in the
		# standard library that expect to use tokenize with strings.
		def generate_tokens(readline):
		    """Tokenize lines from *readline* without decoding or an ENCODING token."""
		    return _tokenize(readline, encoding=None)

		def main():
		    """Command-line interface: tokenize a file (or stdin) and print the tokens."""
		    import argparse

		    # Helper error handling routines
		    def perror(message):
		        print(message, file=sys.stderr)

		    def error(message, filename=None, location=None):
		        # Report the most specific context available, then exit with status 1.
		        if location:
		            text = "%s:%d:%d: error: %s" % ((filename,) + location + (message,))
		        elif filename:
		            text = "%s: error: %s" % (filename, message)
		        else:
		            text = "error: %s" % message
		        perror(text)
		        sys.exit(1)

		    # Parse the arguments and options
		    parser = argparse.ArgumentParser(prog='python -m tokenize')
		    parser.add_argument(
		        dest='filename', nargs='?', metavar='filename.py',
		        help='the file to tokenize; defaults to stdin')
		    parser.add_argument(
		        '-e', '--exact', dest='exact', action='store_true',
		        help='display token names using the exact type')
		    options = parser.parse_args()

		    # Name used in error messages.
		    filename = options.filename or "<stdin>"

		    try:
		        # Tokenize the input
		        if options.filename:
		            with builtins.open(filename, 'rb') as stream:
		                tokens = list(tokenize(stream.readline))
		        else:
		            tokens = _tokenize(sys.stdin.readline, None)

		        # Output the tokenization
		        for tok in tokens:
		            kind = tok.exact_type if options.exact else tok.type
		            span = "%d,%d-%d,%d:" % (tok.start + tok.end)
		            print("%-20s%-15s%-15r" % (span, tok_name[kind], tok.string))
		    except IndentationError as err:
		        line, column = err.args[1][1:3]
		        error(err.args[0], filename, (line, column))
		    except TokenError as err:
		        line, column = err.args[1]
		        error(err.args[0], filename, (line, column))
		    except SyntaxError as err:
		        error(err, filename)
		    except IOError as err:
		        error(err)
		    except KeyboardInterrupt:
		        print("interrupted\n")
		    except Exception as err:
		        perror("unexpected error: %s" % err)
		        raise

		# Run the command-line interface when the module is executed as a script.
		if __name__ == "__main__":
		    main()