Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

""" 

Url Length Spider Middleware 

 

See documentation in docs/topics/spider-middleware.rst 

""" 

 

from scrapy import log 

from scrapy.http import Request 

from scrapy.exceptions import NotConfigured 

 

class UrlLengthMiddleware(object):
    """Spider middleware that drops requests whose URL is too long.

    Items yielded by a spider callback pass through unchanged unless they
    are ``Request`` instances whose ``url`` has more than ``maxlength``
    characters.
    """

    def __init__(self, maxlength):
        """Store the maximum URL length accepted for outgoing requests.

        :param maxlength: largest ``len(request.url)`` that is let through.
        """
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        """Build the middleware from the ``URLLENGTH_LIMIT`` setting.

        :param settings: object exposing ``getint(name)``.
        :returns: a new instance configured with the limit read.
        :raises NotConfigured: when the limit read is falsy (e.g. ``0``).
        """
        maxlength = settings.getint('URLLENGTH_LIMIT')
        if not maxlength:
            raise NotConfigured
        return cls(maxlength)

    def process_spider_output(self, response, result, spider):
        """Lazily filter ``result``, dropping requests with over-long URLs.

        :param response: not used by this method.
        :param result: iterable of spider output; a falsy value (such as
            ``None``) is treated as an empty iterable.
        :param spider: passed through to ``log.msg`` for dropped requests.
        :returns: a generator over the items of ``result`` that are kept.
        """
        def _filter(request):
            # Only Request objects are length-checked; anything else is kept.
            if isinstance(request, Request) and len(request.url) > self.maxlength:
                # The call's own parentheses continue the line, so no
                # backslash continuation is needed.
                log.msg("Ignoring link (url length > %d): %s " % (self.maxlength, request.url),
                        level=log.DEBUG, spider=spider)
                return False
            return True

        return (r for r in result or () if _filter(r))