Coverage for scrapy/http/response/dammit : 69%
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
|
""" This module contains a fork of the UnicodeDammit class from BeautifulSoup, that expliclty disabled any usage of chardet library.
The UnicodeDammit class is used as a last resource for detecting the encoding of a response. """
"""A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is windows-1252, can replace MS smart quotes with their HTML or XML equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML # meta tags to the corresponding Python codec names. It only covers # values that aren't in Python's aliases and can't be determined # by the heuristics in find_codec. "x-sjis" : "shift-jis" }
smartQuotesTo='xml', isHTML=False): self._detectEncoding(markup, isHTML)
# If no luck and we have auto-detection library, try that: u = self._convertFrom(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
"""Changes a MS smart quote character to an XML or HTML entity.""" else: sub = '&%s;' % sub[0]
# Convert smart quotes to HTML if coming from an encoding # that might have them. "iso-8859-1", "iso-8859-2"): (lambda(x): self._subMSChar(x.group(1)), markup)
# print "Trying to convert document to %s" % proposed # print "That didn't work!" # print e #print "Correct encoding: %s" % proposed
'''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present) and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' data = data[2:] and (data[2:4] != '\x00\x00'): encoding = 'utf-16le' data = data[2:] encoding = 'utf-8' data = data[3:] encoding = 'utf-32be' data = data[4:] encoding = 'utf-32le' data = data[4:]
"""Given a document, tries to detect its XML encoding.""" # EBCDIC xml_data = self._ebcdic_to_ascii(xml_data) # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') # UTF-16LE sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') # UTF-32BE sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') # UTF-32LE sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: except: xml_encoding_match = None '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) xml_encoding_match = regexp.search(xml_data) self.declaredHTMLEncoding = xml_encoding (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding
or (charset and self._codec(charset.replace("-", ""))) \ or (charset and self._codec(charset.replace("-", "_"))) \ or charset
except (LookupError, ValueError): pass
c = self.__class__ if not c.EBCDIC_TO_ASCII_MAP: emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ ''.join(map(chr, range(256))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP)
'\x81' : ' ', '\x82' : ('sbquo', '201A'), '\x83' : ('fnof', '192'), '\x84' : ('bdquo', '201E'), '\x85' : ('hellip', '2026'), '\x86' : ('dagger', '2020'), '\x87' : ('Dagger', '2021'), '\x88' : ('circ', '2C6'), '\x89' : ('permil', '2030'), '\x8A' : ('Scaron', '160'), '\x8B' : ('lsaquo', '2039'), '\x8C' : ('OElig', '152'), '\x8D' : '?', '\x8E' : ('#x17D', '17D'), '\x8F' : '?', '\x90' : '?', '\x91' : ('lsquo', '2018'), '\x92' : ('rsquo', '2019'), '\x93' : ('ldquo', '201C'), '\x94' : ('rdquo', '201D'), '\x95' : ('bull', '2022'), '\x96' : ('ndash', '2013'), '\x97' : ('mdash', '2014'), '\x98' : ('tilde', '2DC'), '\x99' : ('trade', '2122'), '\x9a' : ('scaron', '161'), '\x9b' : ('rsaquo', '203A'), '\x9c' : ('oelig', '153'), '\x9d' : '?', '\x9e' : ('#x17E', '17E'), '\x9f' : ('Yuml', ''),}
#######################################################################
|