Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

""" 

This module contains data types used by Scrapy which are not included in the 

Python Standard Library. 

 

This module must not depend on any module outside the Standard Library. 

""" 

 

import copy 

from collections import deque, defaultdict 

from itertools import chain 

 

from scrapy.utils.py27 import OrderedDict 

 

 

class MultiValueDictKeyError(KeyError):
    """KeyError subclass raised by MultiValueDict.__getitem__ for a missing key."""

 

class MultiValueDict(dict):
    """
    A subclass of dictionary customized to handle multiple values for the same key.

    >>> d = MultiValueDict({'name': ['Adrian', 'Simon'], 'position': ['Developer']})
    >>> d['name']
    'Simon'
    >>> d.getlist('name')
    ['Adrian', 'Simon']
    >>> d.get('lastname', 'nonexistent')
    'nonexistent'
    >>> d.setlist('lastname', ['Holovaty', 'Willison'])

    This class exists to solve the irritating problem raised by cgi.parse_qs,
    which returns a list for every key, even though most Web forms submit
    single name-value pairs.

    Internally every key maps to a list of values; item access returns the
    last element of that list, while the ``*list`` methods expose the whole
    list.
    """
    def __init__(self, key_to_list_mapping=()):
        """Initialize from a mapping (or iterable of pairs) of key -> list."""
        dict.__init__(self, key_to_list_mapping)

    def __repr__(self):
        return "<%s: %s>" % (self.__class__.__name__, dict.__repr__(self))

    def __getitem__(self, key):
        """
        Returns the last data value for this key, or [] if it's an empty list;
        raises MultiValueDictKeyError (a KeyError subclass) if not found.
        """
        try:
            list_ = dict.__getitem__(self, key)
        except KeyError:
            # Call-style raise: valid on both Python 2 and Python 3.
            raise MultiValueDictKeyError("Key %r not found in %r" % (key, self))
        try:
            return list_[-1]
        except IndexError:
            return []

    def __setitem__(self, key, value):
        """Replace the list stored for key with the one-element list [value]."""
        dict.__setitem__(self, key, [value])

    def __copy__(self):
        """Shallow copy: a new instance that shares the underlying value lists."""
        return self.__class__(dict.items(self))

    def __deepcopy__(self, memo=None):
        """Deep copy of keys and value lists, honouring the deepcopy memo."""
        if memo is None:
            memo = {}
        result = self.__class__()
        # Register the copy before recursing so self-references resolve to it.
        memo[id(self)] = result
        for key, value in dict.items(self):
            dict.__setitem__(result, copy.deepcopy(key, memo), copy.deepcopy(value, memo))
        return result

    def get(self, key, default=None):
        "Returns the default value if the requested data doesn't exist"
        try:
            val = self[key]
        except KeyError:
            return default
        # An empty value list is treated the same as a missing key.
        if val == []:
            return default
        return val

    def getlist(self, key):
        "Returns an empty list if the requested data doesn't exist"
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return []

    def setlist(self, key, list_):
        "Stores list_ as the complete list of values for key"
        dict.__setitem__(self, key, list_)

    def setdefault(self, key, default=None):
        "Sets key to the single value default if absent; returns the last value"
        if key not in self:
            self[key] = default
        return self[key]

    def setlistdefault(self, key, default_list=()):
        "Sets key to default_list if absent; returns the stored list of values"
        if key not in self:
            self.setlist(key, default_list)
        return self.getlist(key)

    def appendlist(self, key, value):
        "Appends an item to the internal list associated with key"
        self.setlistdefault(key, [])
        # Rebinds the key to a new list instead of mutating the stored one.
        dict.__setitem__(self, key, self.getlist(key) + [value])

    def items(self):
        """
        Returns a list of (key, value) pairs, where value is the last item in
        the list associated with the key.
        """
        return [(key, self[key]) for key in self.keys()]

    def lists(self):
        "Returns a list of (key, list) pairs."
        # list() keeps the documented return type on Python 3, where
        # dict.items() is a view rather than a list.
        return list(dict.items(self))

    def values(self):
        "Returns a list of the last value on every key list."
        return [self[key] for key in self.keys()]

    def copy(self):
        "Returns a copy of this object (a deep copy, via __deepcopy__)."
        return self.__deepcopy__()

    def update(self, *args, **kwargs):
        """
        update() extends rather than replaces existing key lists. Also accepts keyword args.

        Accepts at most one positional argument. A MultiValueDict contributes
        all of its values for each key; any other object is read through its
        items() method and contributes one value per key.

        Raises TypeError if more than one positional argument is given, and
        ValueError if reading the positional argument raises TypeError.
        """
        if len(args) > 1:
            raise TypeError("update expected at most 1 arguments, got %d" % len(args))
        if args:
            other_dict = args[0]
            if isinstance(other_dict, MultiValueDict):
                for key, value_list in other_dict.lists():
                    self.setlistdefault(key, []).extend(value_list)
            else:
                try:
                    for key, value in other_dict.items():
                        self.setlistdefault(key, []).append(value)
                except TypeError:
                    raise ValueError("MultiValueDict.update() takes either a MultiValueDict or dictionary")
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for key, value in kwargs.items():
            self.setlistdefault(key, []).append(value)

 

class SiteNode(object):
    """A node in a site tree: a page, an image or any other file.

    Each node stores its url, the names of items recorded for it
    (``itemnames``), its child nodes and a reference to its parent node.
    """

    def __init__(self, url):
        """Create a node for ``url`` with no parent, children or item names."""
        self.url = url
        self.itemnames = []
        self.children = []
        self.parent = None

    def add_child(self, node):
        """Append ``node`` to this node's children and set its parent to self."""
        self.children.append(node)
        node.parent = self

    def to_string(self, level=0):
        """Return an indented, multi-line text rendering of this subtree.

        Each nesting level is indented by two spaces. The node's url comes
        first, then one "Scraped: <name>" line per entry in ``itemnames``
        (one level deeper), then every child rendered one level deeper.
        """
        own_indent = '  ' * level
        nested_indent = '  ' * (level + 1)
        pieces = ["%s%s\n" % (own_indent, self.url)]
        # "or ()" skips a falsy itemnames without iterating it.
        pieces.extend(
            "%sScraped: %s\n" % (nested_indent, name)
            for name in (self.itemnames or ())
        )
        pieces.extend(child.to_string(level + 1) for child in self.children)
        return "".join(pieces)

 

 

class CaselessDict(dict):
    """Dictionary whose keys are normalized with normkey() on every access.

    The default normkey() lower-cases the key, which makes lookups
    case-insensitive. Values pass through normvalue() before being stored.
    Subclasses may override either hook.
    """

    __slots__ = ()

    def __init__(self, seq=None):
        """Create the dict, optionally filled from a dict or iterable of pairs."""
        super(CaselessDict, self).__init__()
        if seq:
            self.update(seq)

    def __getitem__(self, key):
        return dict.__getitem__(self, self.normkey(key))

    def __setitem__(self, key, value):
        dict.__setitem__(self, self.normkey(key), self.normvalue(value))

    def __delitem__(self, key):
        dict.__delitem__(self, self.normkey(key))

    def __contains__(self, key):
        return dict.__contains__(self, self.normkey(key))
    has_key = __contains__

    def __copy__(self):
        """Return a new instance of the same class built from this one."""
        return self.__class__(self)
    copy = __copy__

    def normkey(self, key):
        """Method to normalize dictionary key access"""
        return key.lower()

    def normvalue(self, value):
        """Method to normalize values before they are stored"""
        return value

    def get(self, key, def_val=None):
        """Return the value for key, or the normalized def_val if absent."""
        return dict.get(self, self.normkey(key), self.normvalue(def_val))

    def setdefault(self, key, def_val=None):
        """Return the value for key, first storing the normalized def_val if absent."""
        return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))

    def update(self, seq):
        """Merge seq (a dict or an iterable of (key, value) pairs) into self.

        Keys and values are normalized before being stored.
        """
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only
        # and made construction from a dict fail on Python 3.
        seq = seq.items() if isinstance(seq, dict) else seq
        iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
        super(CaselessDict, self).update(iseq)

    @classmethod
    def fromkeys(cls, keys, value=None):
        """Build an instance mapping every key in keys to value."""
        return cls((k, value) for k in keys)

    def pop(self, key, *args):
        """Remove key and return its value; an optional default may be given."""
        return dict.pop(self, self.normkey(key), *args)

 

 

class MergeDict(object):
    """
    A simple class for creating new "virtual" dictionaries that actually look
    up values in more than one dictionary, passed in the constructor.

    If a key appears in more than one of the given dictionaries, only the
    first occurrence will be used.
    """
    def __init__(self, *dicts):
        """Store the dictionaries to search, in lookup order."""
        self.dicts = dicts

    def __getitem__(self, key):
        """Return the value from the first dictionary that has key.

        Raises KeyError(key) if no dictionary contains it.
        """
        for dict_ in self.dicts:
            try:
                return dict_[key]
            except KeyError:
                pass
        # Carry the key so the error says what was missing.
        raise KeyError(key)

    def __copy__(self):
        """Return a new instance wrapping the same underlying dictionaries."""
        return self.__class__(*self.dicts)

    def get(self, key, default=None):
        """Return self[key], or default if no dictionary contains key."""
        try:
            return self[key]
        except KeyError:
            return default

    def getlist(self, key):
        """Return getlist(key) from the first dictionary that contains key.

        Returns an empty list if no dictionary contains it. The matching
        dictionary must itself provide a getlist() method.
        """
        for dict_ in self.dicts:
            if key in dict_:
                return dict_.getlist(key)
        return []

    def items(self):
        """Return the concatenated items() of every dictionary, in order.

        A key present in several dictionaries appears once per dictionary.
        """
        item_list = []
        for dict_ in self.dicts:
            item_list.extend(dict_.items())
        return item_list

    def has_key(self, key):
        """Return True if any of the dictionaries contains key."""
        for dict_ in self.dicts:
            if key in dict_:
                return True
        return False

    __contains__ = has_key

    def copy(self):
        """Returns a copy of this object."""
        return self.__copy__()

 

 

class LocalCache(OrderedDict):
    """Dictionary with a finite number of keys.

    Older items expire first: when a key is set while the cache already
    holds ``limit`` entries, the oldest entries are dropped to make room.
    If ``limit`` is None (the default) or 0, the cache is unbounded.
    """

    def __init__(self, limit=None):
        """Create an empty cache holding at most ``limit`` entries."""
        super(LocalCache, self).__init__()
        self.limit = limit

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, evicting the oldest entries first."""
        # Evict only when a limit is set. Comparing len(self) with None is
        # always true on Python 2 (popitem then fails on the empty cache)
        # and raises TypeError on Python 3.
        if self.limit:
            while len(self) >= self.limit:
                self.popitem(last=False)
        super(LocalCache, self).__setitem__(key, value)