import os
from os.path import join, exists
from time import time
import cPickle as pickle

from w3lib.http import headers_dict_to_raw, headers_raw_to_dict

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.http import Headers
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.stats import stats
from scrapy.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object
from scrapy.utils.project import data_path
from scrapy import conf


class HttpCacheMiddleware(object):
    """Downloader middleware that serves requests from the cache and stores
    freshly downloaded responses back into it, using a pluggable storage
    backend."""

    def __init__(self, settings=conf.settings):
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        self.storage.close_spider(spider)

    def process_request(self, request, spider):
        if not self.is_cacheable(request):
            return
        response = self.storage.retrieve_response(spider, request)
        if response and self.is_cacheable_response(response):
            # Flag the response so process_response() won't store it again.
            response.flags.append('cached')
            stats.inc_value('httpcache/hit', spider=spider)
            return response

        stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            raise IgnoreRequest("Ignored request not in cache: %s" % request)

    def process_response(self, request, response, spider):
        if (self.is_cacheable(request)
            and self.is_cacheable_response(response)
            and 'cached' not in response.flags):
            self.storage.store_response(spider, request, response)
            stats.inc_value('httpcache/store', spider=spider)
        return response

    def is_cacheable_response(self, response):
        return response.status not in self.ignore_http_codes

    def is_cacheable(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes
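
# A minimal sketch of an alternative storage backend (not part of the
# original module; the class name DictCacheStorage is hypothetical). The
# middleware above assumes only four methods on its storage object --
# open_spider(), close_spider(), retrieve_response() and store_response() --
# so an in-memory dict keyed by request fingerprint is enough for testing:

class DictCacheStorage(object):

    def __init__(self, settings=conf.settings):
        self._cache = {}

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        # Return the cached response, or None on a miss.
        return self._cache.get(request_fingerprint(request))

    def store_response(self, spider, request, response):
        self._cache[request_fingerprint(request)] = response
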

class FilesystemCacheStorage(object):
    """Cache storage backend that keeps one directory per cached request on
    the local filesystem."""

    def __init__(self, settings=conf.settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with open(join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with open(join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        # 'meta' is a human-readable copy; 'pickled_meta' is what gets read back.
        with open(join(rpath, 'meta'), 'wb') as f:
            f.write(repr(metadata))
        with open(join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with open(join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with open(join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with open(join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with open(join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = join(rpath, 'pickled_meta')
        if not exists(metapath):
            return  # not found
        mtime = os.stat(rpath).st_mtime
        # Chained comparison: expired only if expiration is enabled (> 0)
        # and the entry is older than expiration_secs.
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with open(metapath, 'rb') as f:
            return pickle.load(f)
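
# On disk (see _get_request_path() and store_response() above), each cache
# entry lives in its own directory:
#
#   <HTTPCACHE_DIR>/<spider.name>/<fingerprint[0:2]>/<fingerprint>/
#       meta, pickled_meta, response_headers, response_body,
#       request_headers, request_body
#
# Usage sketch (assumption: a typical Scrapy project; the values below are
# illustrative, only the setting names come from this module):
#
#   HTTPCACHE_ENABLED = True        # otherwise __init__ raises NotConfigured
#   HTTPCACHE_STORAGE = '<dotted path to FilesystemCacheStorage>'
#   HTTPCACHE_DIR = 'httpcache'
#   HTTPCACHE_EXPIRATION_SECS = 0   # 0 disables expiration (see _read_meta)
#   HTTPCACHE_IGNORE_MISSING = False
#   HTTPCACHE_IGNORE_SCHEMES = ['file']
#   HTTPCACHE_IGNORE_HTTP_CODES = []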