Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

""" 

Module for processing Sitemaps. 

 

Note: The main purpose of this module is to provide support for the

SitemapSpider. Its API is subject to change without notice.

""" 

 

from cStringIO import StringIO 

from xml.etree.cElementTree import ElementTree 

 

class Sitemap(object):
    """Parser for Sitemap (type=urlset) and Sitemap Index
    (type=sitemapindex) documents.

    The document is parsed eagerly in the constructor. The ``type``
    attribute holds the root element's tag with any ``{namespace}``
    qualifier removed. Iterating over an instance yields one dict per
    child element of the root.
    """

    def __init__(self, xmltext):
        """Parse ``xmltext`` and record the root element and its type.

        xmltext: the XML document, in a string form accepted by the
            module-level ``StringIO``.

        Errors raised by the XML parser for malformed input are not
        caught here and propagate to the caller.
        """
        tree = ElementTree()
        tree.parse(StringIO(xmltext))
        self._root = tree.getroot()
        self.type = self._local_name(self._root.tag)

    @staticmethod
    def _local_name(tag):
        """Return ``tag`` without its leading ``{namespace}`` qualifier."""
        # Qualified tags look like "{uri}name". Splitting once on '}' and
        # taking the last piece yields "name", and leaves unqualified tags
        # (which contain no '}') untouched.
        return tag.split('}', 1)[-1]

    def __iter__(self):
        """Yield a dict for each child element of the root.

        Each dict maps the namespace-stripped tag of every sub-element of
        that child to its text with surrounding whitespace removed, or to
        '' when the sub-element has no text. If a tag occurs more than
        once within the same child, the last occurrence wins.
        """
        # Iterate elements directly: Element.getchildren() is deprecated
        # and no longer exists on Python 3.9+.
        for entry in self._root:
            fields = {}
            for field in entry:
                text = field.text
                fields[self._local_name(field.tag)] = text.strip() if text else ''
            yield fields

 

def sitemap_urls_from_robots(robots_text):
    """Return an iterator over all sitemap urls contained in the given
    robots.txt file.

    robots_text: the full text of a robots.txt file.

    A line contributes a url when, ignoring leading whitespace and letter
    case, it starts with ``sitemap:``. The yielded value is everything
    after the line's first colon, with surrounding whitespace stripped.
    """
    for line in robots_text.splitlines():
        # robots.txt field names are case-insensitive, so "sitemap:" and
        # "SITEMAP:" must be accepted as well as "Sitemap:".
        if line.lstrip().lower().startswith('sitemap:'):
            # Split only on the first colon so the colon in the url's
            # scheme ("http://...") stays part of the value.
            yield line.split(':', 1)[1].strip()