Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

""" 

This module contains essential stuff that should've come with Python itself ;) 

 

It also contains functions (or functionality) which is in Python versions 

higher than 2.5 which used to be the lowest version supported by Scrapy. 

 

""" 

import os 

import re 

import inspect 

import weakref 

from functools import wraps 

from sgmllib import SGMLParser 

 

 

class FixedSGMLParser(SGMLParser):
    """SGMLParser with a corrected convert_charref() method.

    The SGMLParser shipped with Python has a bug in convert_charref();
    this subclass overrides only that method and inherits everything else.
    """

    def convert_charref(self, name):
        """Convert a numeric character reference into its character.

        Returns None when ``name`` is not an integer literal or when the
        number lies outside the ASCII range; otherwise returns whatever
        convert_codepoint() produces for that number.
        """
        try:
            codepoint = int(name)
        except ValueError:
            return None
        # ASCII ends at 127, not 255
        if codepoint < 0 or codepoint > 127:
            return None
        return self.convert_codepoint(codepoint)

 

 

def flatten(x):
    """flatten(sequence) -> list

    Returns a single, flat list which contains all elements retrieved
    from the sequence and all recursively contained sub-sequences
    (iterables).

    Strings (text or bytes) are always treated as single elements and are
    never split into characters.

    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", ["bar", 1]])
    ['foo', 'bar', 1]
    """

    result = []
    for el in x:
        # The explicit string check matters on Python 3, where str defines
        # __iter__ and iterating a one-character string yields that same
        # string again: without it a string element would recurse forever.
        # On Python 2 strings have no __iter__, so nothing changes there.
        if hasattr(el, "__iter__") and not isinstance(el, (str, bytes)):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result

 

 

def unique(list_, key=lambda x: x):
    """efficient function to uniquify a list preserving item order

    list_ -- iterable of items to de-duplicate
    key   -- callable mapping an item to the (hashable) value used to decide
             whether it has been seen before; defaults to the item itself

    Returns a new list holding the first occurrence of every distinct key,
    in the order the items were encountered.
    """
    seen = set()
    result = []
    for item in list_:
        seenkey = key(item)
        if seenkey in seen:
            continue
        seen.add(seenkey)
        result.append(item)
    return result

 

 

def str_to_unicode(text, encoding=None, errors='strict'):
    """Return the unicode representation of text in the given encoding. Unlike
    .decode(encoding) this function can be applied directly to a unicode
    object without the risk of double-decoding problems (which can happen if
    you don't use the default 'ascii' encoding)

    text     -- byte string to decode, or unicode string (returned unchanged)
    encoding -- codec name; None selects 'utf-8'
    errors   -- error handling scheme handed to .decode()

    Raises TypeError when text is neither a byte string nor a unicode string.
    """

    # The unicode type is obtained from a literal instead of the builtin name
    # `unicode`, which only exists on Python 2. `bytes` is an alias of `str`
    # on Python 2 (2.6+), so the checks below are unchanged there.
    unicode_type = type(u'')
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    elif isinstance(text, unicode_type):
        return text
    else:
        raise TypeError('str_to_unicode must receive a str or unicode object, got %s' % type(text).__name__)

 

def unicode_to_str(text, encoding=None, errors='strict'):
    """Return the str representation of text in the given encoding. Unlike
    .encode(encoding) this function can be applied directly to a str
    object without the risk of double-encoding problems (which can happen if
    you don't use the default 'ascii' encoding)

    text     -- unicode string to encode, or byte string (returned unchanged)
    encoding -- codec name; None selects 'utf-8'
    errors   -- error handling scheme handed to .encode()

    Raises TypeError when text is neither a unicode string nor a byte string.
    """

    # The unicode type is obtained from a literal instead of the builtin name
    # `unicode`, which only exists on Python 2. `bytes` is an alias of `str`
    # on Python 2 (2.6+), so the checks below are unchanged there.
    unicode_type = type(u'')
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, unicode_type):
        return text.encode(encoding, errors)
    elif isinstance(text, bytes):
        return text
    else:
        raise TypeError('unicode_to_str must receive a unicode or str object, got %s' % type(text).__name__)

 

def re_rsearch(pattern, text, chunk_size=1024):
    """
    This function does a reverse search in a text using a regular expression
    given in the attribute 'pattern'.
    Since the re module does not provide this functionality, we have to search
    for the expression in chunks of text taken from the end (for the sake of
    efficiency).
    At first, a chunk of 'chunk_size' kilobytes is extracted from the end, and
    searched for the pattern. If the pattern is not found, the chunk is grown
    by another 'chunk_size' kilobytes towards the start, and another search is
    performed.
    This process continues until a match is found, or until the whole text has
    been searched.
    In case the pattern wasn't found, None is returned, otherwise it returns a
    tuple containing the start position of the match, and the ending
    (regarding the entire text).

    'pattern' may be a pattern string or an already compiled pattern object.
    """
    def _chunk_iter():
        # Yields (tail_of_text, position_of_that_tail_in_text) pairs, each
        # tail reaching chunk_size kilobytes further back than the previous
        # one, and finally the whole text.
        offset = len(text)
        while True:
            offset -= (chunk_size * 1024)
            if offset <= 0:
                break
            yield (text[offset:], offset)
        yield (text, 0)

    # Anything that cannot search by itself is taken to be a pattern string;
    # this avoids the Python-2-only name `basestring`.
    if not hasattr(pattern, 'finditer'):
        pattern = re.compile(pattern)
    for chunk, offset in _chunk_iter():
        # Only the right-most match matters, so walk the iterator and keep
        # the last one instead of collecting every match in a list.
        last = None
        for last in pattern.finditer(chunk):
            pass
        if last is not None:
            start, end = last.span()
            return (offset + start, offset + end)
    return None

 

def memoizemethod_noargs(method):
    """Decorator that remembers, per instance, the first result of a method
    meant to be called without arguments.

    Results are stored in a WeakKeyDictionary keyed by the instance, so the
    stored value does not keep the instance alive. Arguments passed on later
    calls are ignored: the stored result is returned as is.
    """
    results = weakref.WeakKeyDictionary()

    @wraps(method)
    def cached_method(self, *args, **kwargs):
        if self in results:
            return results[self]
        value = method(self, *args, **kwargs)
        results[self] = value
        return value

    return cached_method

 

# Characters with codes 0-31, minus NUL, tab, line feed and carriage return.
_BINARYCHARS = set(chr(code) for code in range(32)) - set("\0\t\n\r")


def isbinarytext(text):
    """Tell whether the given text is considered binary.

    Returns True as soon as one character of ``text`` is found in
    _BINARYCHARS, False when none is (including for empty text).
    ``text`` must be a str; this is enforced with an assertion.
    """
    assert isinstance(text, str), "text must be str, got '%s'" % type(text).__name__
    for char in text:
        if char in _BINARYCHARS:
            return True
    return False

 

def get_func_args(func):
    """Return the argument name list of a callable

    For a plain function the names of its positional parameters are
    returned. For any other object with a __call__ attribute, __call__ is
    inspected instead (so the bound first parameter, e.g. ``self``, is part
    of the list); if that inspection fails an empty list is returned.

    Raises TypeError if func is not callable.
    """
    # getargspec() is the only spelling on Python 2 but no longer exists on
    # recent Python 3 releases; getfullargspec() is preferred when present.
    # The first field of either result is the list of positional names.
    getspec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    if inspect.isfunction(func):
        func_args = getspec(func)[0]
    elif hasattr(func, '__call__'):
        try:
            func_args = getspec(func.__call__)[0]
        except Exception:
            # Deliberate best effort: callables that cannot be introspected
            # are reported as taking no named arguments.
            func_args = []
    else:
        raise TypeError('%s is not callable' % type(func))
    return func_args

 

def equal_attributes(obj1, obj2, attributes):
    """Compare two objects attributes

    obj1, obj2 -- the objects to compare
    attributes -- iterable whose entries are either attribute names or
                  callables (like itemgetter) applied to each object

    Returns True only if every entry yields equal values for both objects.
    Returns False if no attributes are given, or if a named attribute is
    missing from either object.
    """
    # no attributes given: return False by default
    if not attributes:
        return False

    for attr in attributes:
        # support callables like itemgetter
        if callable(attr):
            if not attr(obj1) == attr(obj2):
                return False
        else:
            # check that both objects have the attribute
            if not hasattr(obj1, attr):
                return False
            if not hasattr(obj2, attr):
                return False
            # compare object attributes
            if not getattr(obj1, attr) == getattr(obj2, attr):
                return False
    # all attributes equal
    return True

 

 

class WeakKeyCache(object):
    """Mapping-like cache that builds missing values on demand.

    Values live in a WeakKeyDictionary, so an entry does not keep its key
    alive. Only item lookup (``cache[key]``) is provided.
    """

    def __init__(self, default_factory):
        # default_factory is called with the key to produce a missing value
        self.default_factory = default_factory
        self._weakdict = weakref.WeakKeyDictionary()

    def __getitem__(self, key):
        """Return the value stored for key, creating it with
        default_factory(key) on first access."""
        store = self._weakdict
        if key in store:
            return store[key]
        value = self.default_factory(key)
        store[key] = value
        return value

 

 

def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True):
    """Return a (new) dict with the unicode keys (and values, if keys_only is
    False) of the given dict converted to strings. `dct_or_tuples` can be a
    dict or a list of tuples, like any dict constructor supports.

    dct_or_tuples -- anything accepted by the dict() constructor
    encoding      -- codec used to encode unicode keys/values
    keys_only     -- when True (the default) values are copied untouched

    Keys and values that are not unicode objects are left as they are.
    """
    # The unicode type is obtained from a literal instead of the builtin name
    # `unicode`, and items() is used instead of iteritems(): both spellings
    # give the same result on Python 2 and also exist on Python 3.
    unicode_type = type(u'')
    d = {}
    for k, v in dict(dct_or_tuples).items():
        if isinstance(k, unicode_type):
            k = k.encode(encoding)
        if not keys_only and isinstance(v, unicode_type):
            v = v.encode(encoding)
        d[k] = v
    return d

 

def is_writable(path):
    """Return True if the given path can be written (if it exists) or created
    (if it doesn't exist)

    For an existing path the write permission of the path itself is checked;
    otherwise the write permission of the directory that would contain it.
    """
    if os.path.exists(path):
        return os.access(path, os.W_OK)
    else:
        # A bare file name has an empty dirname, and os.access('') is always
        # False; such a file would be created in the current directory, so
        # that is the directory to check.
        parent = os.path.dirname(path) or os.curdir
        return os.access(parent, os.W_OK)

 

def setattr_default(obj, name, value):
    """Give ``obj`` the attribute ``name`` with ``value`` unless it already
    has an attribute of that name (the attribute counterpart of
    dict.setdefault(), except that nothing is returned).
    """
    if hasattr(obj, name):
        # already set: leave the existing value alone
        return
    setattr(obj, name, value)