Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

""" 

XPath selectors based on lxml 

""" 

 

from lxml import etree 

 

from scrapy.utils.misc import extract_regex 

from scrapy.utils.trackref import object_ref 

from scrapy.utils.python import unicode_to_str 

from scrapy.utils.decorator import deprecated 

from scrapy.http import TextResponse 

from .list import XPathSelectorList 

 

# Public names exported by a star-import of this module.
__all__ = [
    'HtmlXPathSelector',
    'XmlXPathSelector',
    'XPathSelector',
    'XPathSelectorList',
]

 

class XPathSelector(object_ref):
    """Selector that evaluates XPath expressions against an lxml tree.

    A selector is built either from a response object, from a raw ``text``
    string (which is wrapped in a ``TextResponse``), or directly from an
    already-evaluated ``root`` value.  The document is parsed lazily, the
    first time ``root`` is accessed.

    Subclasses choose the parser and serialization method through the
    ``_parser`` and ``_tostring_method`` class attributes.
    """

    # NOTE(review): the 'text' slot is declared but __init__ never assigns it.
    __slots__ = ['response', 'text', 'expr', 'namespaces', '_root', '_xpathev',
                 '__weakref__']
    _parser = etree.HTMLParser
    _tostring_method = 'html'

    def __init__(self, response=None, text=None, root=None, expr=None, namespaces=None):
        """Initialize the selector.

        :param response: response whose ``body``, ``encoding`` and ``url``
            are used when the document has to be parsed.
        :param text: if truthy, takes precedence over ``response``; it is
            encoded as UTF-8 and wrapped in a ``TextResponse`` whose URL is
            ``about:blank``.
        :param root: pre-built root value; when given, no parsing happens.
        :param expr: the XPath expression that produced this selector (only
            used for the string representation).
        :param namespaces: prefix -> URI mapping handed to the XPath
            evaluator, or ``None``.
        """
        if text:
            self.response = TextResponse(url='about:blank',
                                         body=unicode_to_str(text, 'utf-8'),
                                         encoding='utf-8')
        else:
            self.response = response
        self._root = root
        self._xpathev = None
        self.namespaces = namespaces
        self.expr = expr

    @property
    def root(self):
        """Root value of this selector, parsing the response body on first use."""
        if self._root is None:
            parser = self._parser(encoding=self.response.encoding, recover=True)
            self._root = etree.fromstring(self.response.body, parser=parser,
                                          base_url=self.response.url)
        return self._root

    @property
    def xpathev(self):
        """XPath evaluator bound to ``root``; created lazily and cached."""
        if self._xpathev is None:
            self._xpathev = etree.XPathEvaluator(self.root, namespaces=self.namespaces)
        return self._xpathev

    def select(self, xpath):
        """Evaluate ``xpath`` and return the matches as an ``XPathSelectorList``.

        Every result is wrapped in a new selector of the same class that
        shares this selector's ``namespaces``.  A result that is not iterable
        is wrapped as a single-element list.

        :raises ValueError: if lxml raises ``XPathError`` for the expression.
        """
        try:
            result = self.xpathev(xpath)
        except etree.XPathError:
            raise ValueError("Invalid XPath: %s" % xpath)
        if hasattr(result, '__iter__'):
            result = [self.__class__(root=x, expr=xpath, namespaces=self.namespaces)
                      for x in result]
        else:
            result = [self.__class__(root=result, expr=xpath, namespaces=self.namespaces)]
        return XPathSelectorList(result)

    def re(self, regex):
        """Apply ``regex`` to the extracted content via ``extract_regex``."""
        return extract_regex(regex, self.extract())

    def extract(self):
        """Return the content of ``root`` as a unicode string.

        The root is serialized with ``etree.tostring`` using the class's
        ``_tostring_method``.  If that raises ``AttributeError`` or
        ``TypeError``, ``unicode(root)`` is returned instead.
        """
        try:
            return etree.tostring(self.root, method=self._tostring_method,
                                  encoding=unicode)
        except (AttributeError, TypeError):
            return unicode(self.root)

    def register_namespace(self, prefix, uri):
        """Map ``prefix`` to ``uri`` for subsequent XPath queries.

        The cached evaluator is discarded so that the next query rebuilds it
        with the updated mapping; otherwise a namespace registered after the
        first ``select()`` call would never reach the evaluator.
        """
        if self.namespaces is None:
            self.namespaces = {}
        self.namespaces[prefix] = uri
        # Invalidate the evaluator created with the previous namespaces.
        self._xpathev = None

    def __nonzero__(self):
        """A selector is truthy when its extracted content is non-empty."""
        return bool(self.extract())

    def __str__(self):
        """Short representation showing the expression and the first 40
        characters of the extracted data."""
        data = repr(self.extract()[:40])
        return "<%s xpath=%r data=%s>" % (type(self).__name__, self.expr, data)

    __repr__ = __str__

    @deprecated(use_instead='XPathSelector.extract')
    def extract_unquoted(self):
        """Deprecated alias of ``extract()``."""
        return self.extract()

 

 

class XmlXPathSelector(XPathSelector):
    """XPathSelector variant that parses with ``etree.XMLParser`` and
    serializes extracted nodes using the ``'xml'`` method."""

    # No per-instance attributes beyond those of the base class.
    __slots__ = ()
    _tostring_method = 'xml'
    _parser = etree.XMLParser

 

 

class HtmlXPathSelector(XPathSelector):
    """XPathSelector variant that parses with ``etree.HTMLParser`` and
    serializes extracted nodes using the ``'html'`` method."""

    # No per-instance attributes beyond those of the base class.
    __slots__ = ()
    _tostring_method = 'html'
    _parser = etree.HTMLParser