Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

""" 

Item Loader 

 

See documentation in docs/topics/loaders.rst 

""" 

 

from collections import defaultdict 

import re 

 

from scrapy.item import Item 

from scrapy.selector import HtmlXPathSelector 

from scrapy.utils.misc import arg_to_iter, extract_regex 

from scrapy.utils.python import flatten 

from .common import wrap_loader_context 

from .processor import Identity 

 

class ItemLoader(object):
    """Collect values per field name and populate an item from them.

    Values handed to ``add_value``/``replace_value`` are passed through the
    field's input processor and accumulated in ``self._values``.  When
    ``load_item`` is called, each accumulated list is passed through the
    field's output processor and the result is assigned to the item.

    Processors are looked up, in order, from:

    1. a loader attribute named ``<field_name>_in`` / ``<field_name>_out``;
    2. the ``input_processor`` / ``output_processor`` key of the item's
       field metadata (only when the item is an ``Item`` instance);
    3. ``default_input_processor`` / ``default_output_processor``.
    """

    # Class used to build the item when none is passed to the constructor.
    default_item_class = Item
    # Fallback processors used when neither the loader nor the item field
    # declares one.
    default_input_processor = Identity()
    default_output_processor = Identity()

    def __init__(self, item=None, **context):
        """Create a loader for ``item``.

        :param item: item to populate; when ``None`` a new instance of
            ``default_item_class`` is created.
        :param context: arbitrary keyword arguments stored as the loader
            context.  The item itself is added under the ``'item'`` key.
        """
        if item is None:
            item = self.default_item_class()
        self.item = context['item'] = item
        self.context = context
        # field name -> list of values collected so far
        self._values = defaultdict(list)

    def add_value(self, field_name, value, *processors, **kw):
        """Process ``value`` and append the result to ``field_name``.

        ``value`` is first run through ``get_value`` (see there for the
        meaning of ``processors`` and ``kw``).  Nothing is added when that
        yields ``None``.  If ``field_name`` is falsy, the processed value is
        iterated as a mapping of field name -> value and each pair is added.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)

    def replace_value(self, field_name, value, *processors, **kw):
        """Like ``add_value``, but discard previously collected values.

        Nothing is replaced when ``get_value`` yields ``None``.  If
        ``field_name`` is falsy, the processed value is iterated as a mapping
        of field name -> value and each listed field is replaced.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)

    def _add_value(self, field_name, value):
        """Run ``value`` through the input processor and store the result.

        Falsy results of the input processor are dropped, not stored.
        """
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        if processed_value:
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name, value):
        """Forget the values collected for ``field_name``, then add ``value``."""
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(self, value, *processors, **kw):
        """Return ``value`` after applying the optional regex and processors.

        :param value: the raw value.
        :param processors: callables applied in order; each receives the
            previous result.  The chain stops as soon as a result is
            ``None``.
        :param kw: only the ``re`` key is read here.  When it is truthy, it
            is passed with each element of ``value`` to ``extract_regex``
            and the results are flattened into a single list.
        :returns: the processed value (possibly ``None``).
        """
        regex = kw.get('re', None)
        if regex:
            value = arg_to_iter(value)
            value = flatten([extract_regex(regex, x) for x in value])

        for proc in processors:
            if value is None:
                break
            proc = wrap_loader_context(proc, self.context)
            value = proc(value)
        return value

    def load_item(self):
        """Assign the output value of every collected field and return the item."""
        item = self.item
        for field_name in self._values:
            item[field_name] = self.get_output_value(field_name)
        return item

    def get_output_value(self, field_name):
        """Return the collected values of ``field_name`` after output processing.

        :raises ValueError: if the output processor raises any ``Exception``;
            the message carries the field name, the collected values and the
            original error type and text.
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        try:
            return proc(self._values[field_name])
        except Exception as e:
            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" % \
                (field_name, self._values[field_name], type(e).__name__, str(e)))

    def get_collected_values(self, field_name):
        """Return the list of values collected so far for ``field_name``.

        NOTE: ``_values`` is a ``defaultdict(list)``, so asking for a field
        with no collected values returns (and registers) an empty list.
        """
        return self._values[field_name]

    def get_input_processor(self, field_name):
        """Return the input processor for ``field_name``.

        A truthy ``<field_name>_in`` loader attribute wins; otherwise the
        item field's ``input_processor`` or ``default_input_processor``.
        """
        proc = getattr(self, '%s_in' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(field_name, 'input_processor', \
                self.default_input_processor)
        return proc

    def get_output_processor(self, field_name):
        """Return the output processor for ``field_name``.

        A truthy ``<field_name>_out`` loader attribute wins; otherwise the
        item field's ``output_processor`` or ``default_output_processor``.
        """
        proc = getattr(self, '%s_out' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(field_name, 'output_processor', \
                self.default_output_processor)
        return proc

    def _process_input_value(self, field_name, value):
        """Apply the context-wrapped input processor of ``field_name`` to ``value``."""
        proc = self.get_input_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        return proc(value)

    def _get_item_field_attr(self, field_name, key, default=None):
        """Return ``key`` from the item's metadata for ``field_name``.

        ``default`` is returned when the item is not an ``Item`` instance or
        the field metadata has no such key.  The lookup in ``item.fields`` is
        a plain subscript, so a field name missing from it is not handled
        here.
        """
        if isinstance(self.item, Item):
            value = self.item.fields[field_name].get(key, default)
        else:
            value = default
        return value

 

class XPathItemLoader(ItemLoader):
    """Item loader whose values are extracted from a selector via XPath.

    The loader must be given either a ready-made ``selector`` or a
    ``response``; in the latter case the selector is built with
    ``default_selector_class``.
    """

    # Selector class instantiated from the response when no selector is given.
    default_selector_class = HtmlXPathSelector

    def __init__(self, item=None, selector=None, response=None, **context):
        """Set up the selector and delegate the rest to ``ItemLoader``.

        :param item: forwarded to ``ItemLoader.__init__``.
        :param selector: selector to run XPath queries against.
        :param response: used to build a selector when ``selector`` is None;
            also stored in the context.
        :param context: extra loader context; ``selector`` and ``response``
            keys are added to it.
        :raises RuntimeError: if both ``selector`` and ``response`` are None.
        """
        if selector is None:
            if response is None:
                raise RuntimeError("%s must be instantiated with a selector " \
                    "or response" % self.__class__.__name__)
            selector = self.default_selector_class(response)
        self.selector = selector
        context['selector'] = selector
        context['response'] = response
        super(XPathItemLoader, self).__init__(item, **context)

    def add_xpath(self, field_name, xpath, *processors, **kw):
        """Extract ``xpath`` from the selector and add the result to ``field_name``."""
        extracted = self._get_values(xpath, **kw)
        self.add_value(field_name, extracted, *processors, **kw)

    def replace_xpath(self, field_name, xpath, *processors, **kw):
        """Extract ``xpath`` from the selector and replace the values of ``field_name``."""
        extracted = self._get_values(xpath, **kw)
        self.replace_value(field_name, extracted, *processors, **kw)

    def get_xpath(self, xpath, *processors, **kw):
        """Extract ``xpath`` from the selector and return it after ``get_value`` processing."""
        extracted = self._get_values(xpath, **kw)
        return self.get_value(extracted, *processors, **kw)

    def _get_values(self, xpaths, **kw):
        """Return the flattened extraction results of one or more XPaths.

        ``kw`` is accepted so callers can forward their keyword arguments
        unchanged; it is not used here.
        """
        per_query = []
        for expression in arg_to_iter(xpaths):
            per_query.append(self.selector.select(expression).extract())
        return flatten(per_query)