""" 

An extension to retry failed requests that are potentially caused by temporary 

problems such as a connection timeout or HTTP 500 error. 

 

You can change the behaviour of this middleware by modifing the scraping settings: 

RETRY_TIMES - how many times to retry a failed page 

RETRY_HTTP_CODES - which HTTP response codes to retry 

 

Failed pages are collected on the scraping process and rescheduled at the end, 

once the spider has finished crawling all regular (non failed) pages. Once 

there is no more failed pages to retry this middleware sends a signal 

(retry_complete), so other extensions could connect to that signal. 

 

About HTTP errors to consider: 

 

- You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP 

  protocol. It's included by default because it's a common code used to 

  indicate server overload, which would be something we want to retry 

""" 

 
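
# A minimal sketch of how these settings might be tuned in a project's
# settings.py; the values below are illustrative, not the canonical defaults:
#
#     RETRY_ENABLED = True
#     RETRY_TIMES = 2  # retry each failed page up to 2 more times
#     RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408]
#     RETRY_PRIORITY_ADJUST = -1  # schedule retries at lower priority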

from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookupError, \
                                   ConnectionRefusedError, ConnectionDone, ConnectError, \
                                   ConnectionLost, TCPTimedOutError
from twisted.internet.defer import TimeoutError as UserTimeoutError

from scrapy import log
from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.conf import settings


class RetryMiddleware(object):

    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response.
    EXCEPTIONS_TO_RETRY = (ServerTimeoutError, UserTimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError,
                           IOError)
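
    # Since EXCEPTIONS_TO_RETRY is a class attribute, a subclass can widen it
    # without copying the rest of the middleware. A hypothetical sketch:
    #
    #     class LenientRetryMiddleware(RetryMiddleware):
    #         EXCEPTIONS_TO_RETRY = RetryMiddleware.EXCEPTIONS_TO_RETRY + (ValueError,)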

    def __init__(self): 

42        if not settings.getbool('RETRY_ENABLED'): 

            raise NotConfigured 

        self.max_retry_times = settings.getint('RETRY_TIMES') 

        self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) 

        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') 

 

    def process_response(self, request, response, spider):
        if 'dont_retry' in request.meta:
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response
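
    # A spider can exempt individual requests from retrying via the
    # 'dont_retry' meta key checked above (note that key presence, not its
    # value, is what disables retries here). A minimal usage sketch:
    #
    #     Request('http://www.example.com', meta={'dont_retry': True})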

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and 'dont_retry' not in request.meta:
            return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        if retries <= self.max_retry_times:
            log.msg("Retrying %s (failed %d times): %s" % (request, retries, reason),
                    spider=spider, level=log.DEBUG)
            # Reschedule a copy of the request; dont_filter bypasses the
            # duplicates filter, which would otherwise discard the retry as
            # an already-seen URL.
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust
            return retryreq
        else:
            log.msg("Gave up retrying %s (failed %d times): %s" % (request, retries, reason),
                    spider=spider, level=log.DEBUG)
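
# This middleware disables itself (raises NotConfigured) unless RETRY_ENABLED
# is set. To swap in a customised subclass, a project would typically override
# the entry in the DOWNLOADER_MIDDLEWARES setting; the module path and
# priority below are an illustrative sketch, not canonical values:
#
#     DOWNLOADER_MIDDLEWARES = {
#         'myproject.middlewares.LenientRetryMiddleware': 500,
#     }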