Show More
Commit Description:
merge with algo and add brython files that were missing
Commit Description:
merge with algo and add brython files that were missing
References:
File last commit:
Show/Diff file:
Action:
lib/assets/Lib/codecs.py
| 1099 lines
| 35.7 KiB
| text/x-python
| PythonLexer
|
r584 | """ codecs -- Python Codec Registry, API and helpers. | |||
Written by Marc-Andre Lemburg (mal@lemburg.com). | ||||
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | ||||
"""#" | ||||
import builtins, sys | ||||
### Registry and builtin stateless codec functions | ||||
try: | ||||
from _codecs import * | ||||
except ImportError as why: | ||||
raise SystemError('Failed to load the builtin codecs: %s' % why) | ||||
# Public names of this module.  In addition to the names re-exported from
# _codecs (register, lookup, register_error, lookup_error), the list names
# the codec base classes, the stream wrappers and the lookup/iteration
# helpers defined in this file, and every predefined error handler
# (including backslashreplace_errors).
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) and the byte
# strings it is written as in UTF-8/UTF-16/UTF-32, for little and big
# endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16

# UTF-32, native endianness
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
### Codec base classes (defining the API) | ||||
class CodecInfo(tuple):
    """Record describing one codec.

    The instance is the 4-tuple (encode, decode, streamreader,
    streamwriter).  The same four values, the two incremental coder
    factories and the codec name are additionally stored as attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        # Attribute access mirrors the tuple slots and adds the extras.
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Additional values can be made available via register_error.

    """

    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            No state may be stored in the Codec instance.  Codecs that
            need to keep state to encode/decode efficiently should use
            StreamCodec instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            No state may be stored in the Codec instance.  Codecs that
            need to keep state to encode/decode efficiently should use
            StreamCodec instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
class IncrementalEncoder:
    """
    Base class for encoders that process their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    the state of the encoding process between those calls.
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance.  The predefined values are listed in the Codec class
        docstring.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (no-op in this base class).
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder.  state must have been
        returned by getstate().  This base class ignores it.
        """
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder variant for encoders that cannot always consume
    all of their input: whatever _buffer_encode() leaves unconsumed is
    kept in self.buffer and prepended to the input of the next encode()
    call.
    """

    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        return self.buffer if self.buffer else 0

    def setstate(self, state):
        # a false state (such as 0) stands for an empty buffer
        self.buffer = state if state else ""
class IncrementalDecoder:
    """
    Base class for decoders that process their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    the state of the decoding process between those calls.
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance.  The predefined values are listed in the Codec class
        docstring.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (no-op in this base class).
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset(),
        getstate() must return (b"", 0), which is what this base class
        always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().  This base
        class ignores the argument.
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder variant for decoders that must cope with
    incomplete byte sequences: bytes that _buffer_decode() does not
    consume are kept in self.buffer and prepended to the input of the
    next decode() call.
    """

    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the second item (additional state info) is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the rest is ignored
        self.buffer = state[0]
# | ||||
# The StreamWriter and StreamReader class provide generic working | ||||
# interfaces which can be used to implement new encoding submodules | ||||
# very easily. See encodings/utf_8.py for an example on how this is | ||||
# done. | ||||
# | ||||
class StreamWriter(Codec):
    """Codec that encodes objects and writes the result to a stream.

    Subclasses supply encode(); attributes not defined here are looked
    up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme.  These values
            are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the encoded
            data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and pass the result to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            new data without having to rescan the whole stream to
            recover state.  The base implementation does nothing.

        """
        return None

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; a seek to the very
            start (offset 0, whence 0) also resets the codec.
        """
        self.stream.seek(offset, whence)
        if offset == 0 and whence == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
### | ||||
class StreamReader(Codec):
    """Codec that reads data from a stream and returns it decoded.

    Subclasses supply decode().  Three buffers are kept between calls:
    bytebuffer (bytes read but not yet decoded), charbuffer (decoded
    data not yet returned) and linebuffer (a list of complete lines
    cached by readline(), or None).  While linebuffer holds lines,
    charbuffer is None.  Attributes not defined here are looked up on
    the wrapped stream.
    """

    # type of the decoded data; an empty instance of it is used as the
    # empty character buffer
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Decode input; must be overridden by subclasses.

            read() expects an (output, length consumed) tuple back.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes before the error; the
                    # error is re-raised unless that prefix decodes to
                    # more than one line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.
        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # without an explicit size, start with 72 bytes; the loop below
        # doubles the amount while it is smaller than 8000
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        # charbuffer is unused while lines are cached
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.
        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.
        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        # an empty line from readline() ends the iteration
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
### | ||||
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        It is designed so that the factory functions returned by
        codec.lookup() can be used to construct an instance.  Reads
        go through the reader object, writes through the writer
        object; everything else is looked up on the stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively; each is called with (stream, errors).

            Error handling works as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # -- reading: delegated to the reader --------------------------------

    def read(self, size=-1):
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    # -- writing: delegated to the writer --------------------------------

    def write(self, data):
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        writer = self.writer
        return writer.writelines(list)

    # -- state handling ---------------------------------------------------

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # The reader is reset after every seek; the writer only when
        # seeking to the very start of the stream.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if offset == 0 and whence == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
### | ||||
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned encoded to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Read through the backend reader and return the data
            re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):
        """ Read one line through the backend reader and return it
            re-encoded with the frontend encoder.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):
        """ Read everything available, re-encode it and return it as
            a list of lines (line ends kept).  sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):
        """ Decode data with the frontend decoder and write the result
            through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):
        """ Concatenate the items of list, decode the result with the
            frontend decoder and write it through the backend writer.

            The items are joined with a separator of matching type:
            frontend data is normally bytes (it is fed to the decode
            function, just like the argument of write()), so joining
            with a str separator fails for it.  Lists of str are still
            joined as str for codecs whose decoder accepts str.
        """
        # "list" shadows the builtin here, hence tuple(); materialising
        # also allows one-shot iterables to be inspected and joined.
        chunks = tuple(list)
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Set the stream's current position and drop stale codec state.

            Without this method a seek would be forwarded to the raw
            stream by __getattr__ and the reader would go on returning
            data buffered from the old position.  As in
            StreamReaderWriter.seek(), the reader is reset on every
            seek and the writer only on a seek to the very start.

            Returns whatever the underlying stream's seek() returns.
        """
        result = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return result

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
### Shortcuts | ||||
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified.  This is done to
        avoid data loss due to encodings using 8-bit values.  The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the object returned by the builtin open()
        is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the file handle when wrapping fails (for example
        # on an unknown encoding); the caller never gets to see it.
        file.close()
        raise
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings transparently.

        Data written to the wrapper is interpreted according to
        data_encoding and then written to the original file using
        file_encoding.  The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        Data read from the file is interpreted using file_encoding
        and handed back to the caller using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the values used;
        they can be used for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend codec: what the caller reads and writes
    frontend = lookup(data_encoding)
    # backend codec: what is stored in the file
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
### Helpers for codec lookup | ||||
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        A LookupError is raised when the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        A LookupError is raised when the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        A LookupError is raised when the encoding cannot be found
        or the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        A LookupError is raised when the encoding cannot be found
        or the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        A LookupError is raised when the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        A LookupError is raised when the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds the strings produced by iterator through an IncrementalEncoder
    for encoding and yields every non-empty piece of output, including
    whatever the encoder emits when it is finalised at the end.
    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    incremental = getincrementalencoder(encoding)(errors, **kwargs)
    for piece in iterator:
        chunk = incremental.encode(piece)
        if chunk:
            yield chunk
    # final call with empty input flushes the encoder
    tail = incremental.encode("", True)
    if tail:
        yield tail
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds the input produced by iterator through an IncrementalDecoder
    for encoding and yields every non-empty piece of output, including
    whatever the decoder emits when it is finalised at the end.
    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    incremental = getincrementaldecoder(encoding)(errors, **kwargs)
    for piece in iterator:
        chunk = incremental.decode(piece)
        if chunk:
            yield chunk
    # final call with empty input flushes the decoder
    tail = incremental.decode(b"", True)
    if tail:
        yield tail
### Helpers for charmap-based codecs | ||||
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is mapped to itself.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map is
        mapped to None (undefined mapping), causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # a second sighting of the same target makes it ambiguous
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
### error handlers | ||||
try:
    # Bind the predefined error handlers to module level names
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import is guarded by a condition that is always false,
# so it is never executed when this module is loaded.
_false = 0
if _false:
    import encodings
### Tests

# Only runs when the module is executed as a script: sys.stdout and
# sys.stdin are replaced by StreamRecoder wrappers built by EncodedFile().
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')