""" 

This module implements the XMLFeedSpider which is the recommended spider to use 

for scraping from an XML feed. 

 

See documentation in docs/topics/spiders.rst 

""" 

 

from scrapy.spider import BaseSpider 

from scrapy.item import BaseItem 

from scrapy.http import Request 

from scrapy.utils.iterators import xmliter, csviter 

from scrapy.selector import XmlXPathSelector, HtmlXPathSelector 

from scrapy.exceptions import NotConfigured, NotSupported 

 


class XMLFeedSpider(BaseSpider):
    """
    This class is intended to be the base class for spiders that scrape
    from XML feeds.

    You can choose whether to parse the file using the 'iternodes' iterator,
    an 'xml' selector, or an 'html' selector. In most cases it's convenient
    to use 'iternodes', since it's faster and cleaner.
    """

    iterator = 'iternodes'
    itertag = 'item'
    namespaces = ()
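
    # Hypothetical example of the expected format: each (prefix, uri) pair is
    # registered on the selector, so the prefix can then be used in itertag
    # when the 'xml' or 'html' iterator is selected, e.g.:
    #
    #     namespaces = [('atom', 'http://www.w3.org/2005/Atom')]
    #     itertag = 'atom:entry'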

    def process_results(self, response, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last-minute
        processing required before returning the results to the framework
        core, for example setting the item GUIDs. It receives a list of
        results and the response which originated those results. It must
        return a list of results (Items or Requests).
        """
        return results
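
    # For example, a subclass could stamp each scraped item before it is
    # handed back to the framework core. A sketch, assuming your item class
    # declares a (hypothetical) 'guid' field:
    #
    #     def process_results(self, response, results):
    #         for result in results:
    #             if isinstance(result, BaseItem):
    #                 result['guid'] = response.url
    #         return results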

    def adapt_response(self, response):
        """You can override this function in order to make any changes you
        want to the feed before parsing it. This function must return a
        response.
        """
        return response
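
    # For example, a subclass could clean up a malformed feed before it is
    # parsed. A sketch; the entity replacement is purely illustrative:
    #
    #     def adapt_response(self, response):
    #         body = response.body.replace('&nbsp;', ' ')
    #         return response.replace(body=body)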

    def parse_node(self, response, selector):
        """This method must be overridden with your custom spider functionality"""
        if hasattr(self, 'parse_item'):  # backward compatibility
            return self.parse_item(response, selector)
        raise NotImplementedError

    def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). It receives the response and iterates over the nodes,
        calling parse_node with the response and an XPathSelector for each
        node. Overriding parse_node is mandatory; otherwise, your spider
        won't work. Each call to parse_node must return either a BaseItem, a
        Request, or a list containing any of them.
        """

        for selector in nodes:
            ret = self.parse_node(response, selector)
            if isinstance(ret, (BaseItem, Request)):
                ret = [ret]
            if not isinstance(ret, (list, tuple)):
                raise TypeError('You cannot return an "%s" object from a spider' % type(ret).__name__)
            for result_item in self.process_results(response, ret):
                yield result_item

    def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define a parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = xmliter(response, self.itertag)
        elif self.iterator == 'xml':
            selector = XmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = HtmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)

    def _register_namespaces(self, selector):
        for (prefix, uri) in self.namespaces:
            selector.register_namespace(prefix, uri)
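
# A minimal usage sketch for XMLFeedSpider (the feed URL, item class and
# field names below are hypothetical; the selector calls assume the default
# 'iternodes' iterator, which yields an XPathSelector per matching node):
#
#     from scrapy.contrib.spiders import XMLFeedSpider
#     from scrapy.item import Item, Field
#
#     class ExampleItem(Item):
#         title = Field()
#         link = Field()
#
#     class ExampleXMLSpider(XMLFeedSpider):
#         name = 'example.com'
#         start_urls = ['http://www.example.com/feed.xml']
#         itertag = 'item'
#
#         def parse_node(self, response, node):
#             item = ExampleItem()
#             item['title'] = node.select('title/text()').extract()
#             item['link'] = node.select('link/text()').extract()
#             return item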

class CSVFeedSpider(BaseSpider):
    """Spider for parsing CSV feeds.
    It receives a CSV file in a response, iterates through each of its rows,
    and calls parse_row with a dict containing each field's data.

    You can set some options regarding the CSV file, such as the delimiter
    and the file's headers.
    """

    delimiter = None  # When this is None, the csv module's default delimiter (',') is used
    headers = None
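
    # Hypothetical example values, for a semicolon-delimited feed with no
    # header row (so the column names are supplied explicitly):
    #
    #     delimiter = ';'
    #     headers = ['id', 'name', 'description']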

    def process_results(self, response, results):
        """This method has the same purpose as the one in XMLFeedSpider"""
        return results

    def adapt_response(self, response):
        """This method has the same purpose as the one in XMLFeedSpider"""
        return response

    def parse_row(self, response, row):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_rows(self, response):
        """Receives a response and, for each row of the CSV file, calls
        parse_row with a dict that has a key for each provided (or detected)
        header. This spider also gives the opportunity to override the
        adapt_response and process_results methods for pre- and
        post-processing purposes.
        """

        for row in csviter(response, self.delimiter, self.headers):
            ret = self.parse_row(response, row)
            if isinstance(ret, (BaseItem, Request)):
                ret = [ret]
            if not isinstance(ret, (list, tuple)):
                raise TypeError('You cannot return an "%s" object from a spider' % type(ret).__name__)
            for result_item in self.process_results(response, ret):
                yield result_item

    def parse(self, response):
        if not hasattr(self, 'parse_row'):
            raise NotConfigured('You must define a parse_row method in order to scrape this CSV feed')
        response = self.adapt_response(response)
        return self.parse_rows(response)
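
# A minimal usage sketch for CSVFeedSpider (the feed URL, item class and
# field names below are hypothetical; headers is set because the example
# feed is assumed to have no header row):
#
#     from scrapy.contrib.spiders import CSVFeedSpider
#     from scrapy.item import Item, Field
#
#     class RowItem(Item):
#         id = Field()
#         name = Field()
#
#     class ExampleCSVSpider(CSVFeedSpider):
#         name = 'example.com-csv'
#         start_urls = ['http://www.example.com/feed.csv']
#         delimiter = ';'
#         headers = ['id', 'name']
#
#         def parse_row(self, response, row):
#             item = RowItem()
#             item['id'] = row['id']
#             item['name'] = row['name']
#             return item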