Coverage for scrapy/contrib/downloadermiddleware/robotstxt : 45%
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
|
""" This is a middleware to respect robots.txt policies. To activate it you must enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
# Constructor statements of the middleware.
# NOTE(review): the enclosing ``def`` line is not part of this extract.

# Keep the crawler around; its engine is used later to download robots.txt.
self.crawler = crawler

# Bookkeeping, all starting out empty:
#   _parsers         netloc -> robots.txt parser (None until parsed)
#   _spider_netlocs  spider -> set of netlocs recorded for that spider
#   _useragents      spider -> user agent string used for the checks
self._parsers = {}
self._spider_netlocs = {}
self._useragents = {}

# Hook the per-spider setup/teardown handlers onto the lifecycle signals.
connect = dispatcher.connect
connect(self.spider_opened, signals.spider_opened)
connect(self.spider_closed, signals.spider_closed)
def from_crawler(cls, crawler):
# Reject a request that robots.txt forbids for this spider's user agent.
# NOTE(review): the enclosing ``def`` line is not part of this extract;
# ``request`` and ``spider`` are expected to be in scope.
#
# Raises IgnoreRequest when a parser for the request's host is available and
# it disallows the URL. When no parser is available yet, nothing is raised.
agent = self._useragents[spider]
parser = self.robot_parser(request, spider)
if parser:
    allowed = parser.can_fetch(agent, request.url)
    if not allowed:
        log.msg("Forbidden by robots.txt: %s" % request, log.DEBUG)
        raise IgnoreRequest
# Return the cached robots.txt parser for the request's netloc.
#
# On the first request seen for a netloc, a ``None`` placeholder is stored in
# ``self._parsers`` and a download of ``<scheme>://<netloc>/robots.txt`` is
# started through ``self.crawler.engine.download`` with priority
# ``self.DOWNLOAD_PRIORITY``; ``self._parse_robots`` is attached as the
# callback of the returned deferred. The value returned here is whatever is
# currently cached for the netloc, so it is ``None`` until that callback has
# stored a parser.
#
# The netloc is also added to ``self._spider_netlocs[spider]``.
# NOTE(review): line breaks and indentation were lost in this extract, so it
# cannot be determined here whether that ``add`` call belongs inside the
# ``if netloc not in self._parsers`` branch or runs on every call -- confirm
# against the original file before reformatting.
# NOTE(review): no errback is attached to the download deferred in the
# visible code; a failed robots.txt fetch leaves the ``None`` placeholder.
url = urlparse_cached(request) netloc = url.netloc if netloc not in self._parsers: self._parsers[netloc] = None robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc) robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY) dfd = self.crawler.engine.download(robotsreq, spider) dfd.addCallback(self._parse_robots) self._spider_netlocs[spider].add(netloc) return self._parsers[netloc]
# Turn a downloaded robots.txt response into a parser and cache it.
# NOTE(review): the enclosing ``def`` line is not part of this extract;
# ``response`` is expected to be in scope.
parser = robotparser.RobotFileParser(response.url)

# The parser consumes the file as a sequence of lines.
robots_lines = response.body.splitlines()
parser.parse(robots_lines)

# Cache under the netloc of the response URL, replacing any earlier entry.
host = urlparse_cached(response).netloc
self._parsers[host] = parser
# Per-spider setup: start an empty netloc set and remember the user agent.
# NOTE(review): the enclosing ``def`` line is not part of this extract;
# ``spider`` is expected to be in scope.
self._spider_netlocs[spider] = set()

# The user agent comes from the spider's own settings object.
user_agent = spider.settings['USER_AGENT']
self._useragents[spider] = user_agent
# Per-spider teardown: drop the parsers cached for every netloc recorded
# against this spider, then forget the spider's own bookkeeping entries.
# NOTE(review): the enclosing ``def`` line is not part of this extract;
# ``spider`` is expected to be in scope.
recorded_hosts = self._spider_netlocs[spider]
for host in recorded_hosts:
    del self._parsers[host]

# ``pop`` without a default raises KeyError for an unknown spider, like ``del``.
self._spider_netlocs.pop(spider)
self._useragents.pop(spider)