Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

""" 

This module implements the HtmlImageLinkExtractor for extracting  

image links only. 

""" 

 

from urlparse import urljoin 

from scrapy.link import Link 

from scrapy.utils.url import canonicalize_url 

from scrapy.utils.python import unicode_to_str, flatten 

from scrapy.selector.libxml2sel import XPathSelectorList, HtmlXPathSelector 

 

class HTMLImageLinkExtractor(object): 

    '''HTMLImageLinkExtractor objects are intended to extract image links from HTML pages 

    given certain xpath locations. 

 

    These locations can be passed in a list/tuple either when instanciating the LinkExtractor, 

    or whenever you call extract_links. 

    If no locations are specified in any of these places, a default pattern '//img' will be used. 

    If locations are specified when instanciating the LinkExtractor, and also when calling extract_links, 

    both locations will be used for that call of extract_links''' 

 

    def __init__(self, locations=None, unique=True, canonicalize=True): 

        self.locations = flatten([locations]) 

        self.unique = unique 

        self.canonicalize = canonicalize 

 

    def extract_from_selector(self, selector, encoding, parent=None): 

        ret = [] 

        def _add_link(url_sel, alt_sel=None): 

            url = flatten([url_sel.extract()]) 

            alt = flatten([alt_sel.extract()]) if alt_sel else (u'', ) 

exit            if url: 

                ret.append(Link(unicode_to_str(url[0], encoding), alt[0])) 

 

47        if selector.xmlNode.type == 'element': 

            if selector.xmlNode.name == 'img': 

                _add_link(selector.select('@src'), selector.select('@alt') or \ 

                          selector.select('@title')) 

            else: 

                children = selector.select('child::*') 

                if len(children): 

                    for child in children: 

                        ret.extend(self.extract_from_selector(child, encoding, parent=selector)) 

                elif selector.xmlNode.name == 'a' and not parent: 

                    _add_link(selector.select('@href'), selector.select('@title')) 

        else: 

            _add_link(selector) 

 

        return ret 

 

    def extract_links(self, response): 

        xs = HtmlXPathSelector(response) 

        base_url = xs.select('//base/@href').extract() 

        base_url = urljoin(response.url, base_url[0].encode(response.encoding)) if base_url else response.url 

 

        links = [] 

        for location in self.locations: 

            if isinstance(location, basestring): 

                selectors = xs.select(location) 

61            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)): 

                selectors = [location] if isinstance(location, HtmlXPathSelector) else location 

            else: 

                continue 

 

            for selector in selectors: 

                links.extend(self.extract_from_selector(selector, response.encoding)) 

 

        seen, ret = set(), [] 

        for link in links: 

            link.url = urljoin(base_url, link.url) 

            if self.unique: 

                if link.url in seen: 

                    continue 

                else: 

                    seen.add(link.url) 

78            if self.canonicalize: 

                link.url = canonicalize_url(link.url) 

            ret.append(link) 

 

        return ret 

 

    def matches(self, url): 

        return False