Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

""" 

This module implements the CrawlSpider, which is the recommended spider to use
for scraping typical web sites that require crawling pages.

 

See documentation in docs/topics/spiders.rst 

""" 

 

import copy 

 

from scrapy.http import Request, HtmlResponse 

from scrapy.utils.spider import iterate_spider_output 

from scrapy.spider import BaseSpider 

from scrapy.conf import settings 

 

def identity(x):
    """Return ``x`` unchanged.

    Serves as the default, no-op ``process_request`` hook of ``Rule``.
    """
    return x

 

class Rule(object):
    """Bundle a link extractor with the hooks applied to the links it finds.

    Attributes:
        link_extractor: object used to extract links from a response.
        callback: callable (or name of a spider method) invoked with each
            response downloaded for an extracted link; may be None.
        cb_kwargs: keyword arguments passed to ``callback``.
        follow: whether links should be followed from responses matched by
            this rule.
        process_links: optional hook applied to the list of extracted links.
        process_request: hook applied to every request built from a link.
    """

    def __init__(self, link_extractor, callback=None, cb_kwargs=None,
                 follow=None, process_links=None, process_request=identity):
        # When ``follow`` is not given explicitly, links are followed only
        # if the rule has no callback.
        if follow is None:
            follow = not callback

        self.link_extractor = link_extractor
        self.callback = callback
        # A falsy ``cb_kwargs`` (None, empty dict) is replaced by a fresh dict.
        self.cb_kwargs = cb_kwargs if cb_kwargs else {}
        self.process_links = process_links
        self.process_request = process_request
        self.follow = follow

 

class CrawlSpider(BaseSpider):
    """Spider that follows links according to a sequence of ``Rule`` objects.

    Subclasses declare ``rules``. For every HTML response that should be
    followed, each rule's link extractor is run and one request per extracted
    link is produced. Those requests come back through
    ``_response_downloaded``, which applies the originating rule's callback,
    ``cb_kwargs`` and ``follow`` flag.
    """

    # Rule objects declared by subclasses; copied per instance by
    # _compile_rules() so the class-level rules are never mutated.
    rules = ()

    def __init__(self, *a, **kw):
        """Initialise the base spider, then resolve the rule hooks."""
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        """Process ``response`` with ``parse_start_url`` and follow its links."""
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        """Hook used by ``parse``; returns no items or requests by default."""
        return []

    def process_results(self, response, results):
        """Hook applied to every callback result; returns it unchanged by default."""
        return results

    def _requests_to_follow(self, response):
        """Yield one request per link extracted from ``response`` by the rules.

        Non-HTML responses produce nothing. A link already taken by an
        earlier rule is not offered to later rules.
        """
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                # Record links while iterating instead of up front, so that a
                # one-shot iterable returned by process_links is not exhausted
                # before the requests are built.
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                # The rule index lets _response_downloaded find the rule again.
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def _response_downloaded(self, response):
        """Dispatch a followed response to the rule stored in its meta."""
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        """Yield the callback's output, then the requests for links to follow.

        ``callback`` may be None, in which case only link following happens.
        Links are followed only when ``follow`` is true and the
        ``CRAWLSPIDER_FOLLOW_LINKS`` setting (default True) is enabled.
        """
        if callback:
            # A falsy callback result (e.g. None) is treated as "no output".
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for request_or_item in iterate_spider_output(cb_res):
                yield request_or_item

        if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        """Copy ``rules`` into ``self._rules`` and resolve their hooks.

        Hooks given as strings are looked up as attributes of this spider.
        """
        def get_method(method):
            # Callables pass through; strings are resolved on the spider and
            # become None when no such attribute exists. Anything else
            # (including None) also yields None.
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)

        # Shallow copies keep the class-level Rule objects untouched.
        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)