Coverage for scrapy/utils/sitemap: 100%

"""

Module for processing Sitemaps.

Note: The main purpose of this module is to provide support for the

SitemapSpider; its API is subject to change without notice.

"""

from cStringIO import StringIO

from xml.etree.cElementTree import ElementTree

class Sitemap(object):
    """Parser for Sitemap (type=urlset) and Sitemap Index
    (type=sitemapindex) documents.

    Iterating over an instance yields one dict per child entry of the root
    element. Each dict maps the namespace-stripped tag name of every
    sub-element of that entry to its whitespace-stripped text ('' when the
    sub-element has no text).
    """

    def __init__(self, xmltext):
        """Parse *xmltext* and record the document type.

        ``self.type`` is the root element's tag with any ``{namespace}``
        qualifier removed (e.g. ``urlset`` or ``sitemapindex``).
        """
        tree = ElementTree()
        tree.parse(StringIO(xmltext))
        self._root = tree.getroot()
        self.type = self._local_name(self._root.tag)

    @staticmethod
    def _local_name(tag):
        """Return *tag* without its leading ``{namespace}`` qualifier."""
        # ElementTree spells qualified names as '{uri}local'. With
        # maxsplit=1 the last piece is the local name when a namespace is
        # present, and the whole tag when it is not.
        return tag.split('}', 1)[-1]

    def __iter__(self):
        """Yield a ``{tag name: text}`` dict for each entry under the root."""
        # Iterate the elements directly: Element.getchildren() is deprecated
        # and no longer exists on Python 3.9+, while direct iteration works
        # on every ElementTree version.
        for entry in self._root:
            fields = {}
            for child in entry:
                name = self._local_name(child.tag)
                fields[name] = child.text.strip() if child.text else ''
            yield fields

def sitemap_urls_from_robots(robots_text):
    """Return an iterator over all sitemap urls contained in the given
    robots.txt file.

    A line counts as a sitemap directive when, ignoring leading whitespace
    and letter case, it starts with ``Sitemap:``. The value yielded is
    everything after the first colon on that line, stripped of surrounding
    whitespace.
    """
    for line in robots_text.splitlines():
        # robots.txt field names are case-insensitive, so 'sitemap:' and
        # 'SITEMAP:' must be recognised as well as 'Sitemap:'.
        if line.lstrip().lower().startswith('sitemap:'):
            # Split on the first colon only: the URL itself contains colons.
            yield line.split(':', 1)[1].strip()

Coverage for scrapy/utils/sitemap: 100%