Coverage for scrapy/contrib/spiders/crawl : 52%
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
|
""" This module implements the CrawlSpider, which is the recommended spider to use for scraping typical web sites that require crawling pages.
See documentation in docs/topics/spiders.rst """
return x
self.link_extractor = link_extractor self.callback = callback self.cb_kwargs = cb_kwargs or {} self.process_links = process_links self.process_request = process_request if follow is None: self.follow = False if callback else True else: self.follow = follow
return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
return []
return results
if not isinstance(response, HtmlResponse): return seen = set() for n, rule in enumerate(self._rules): links = [l for l in rule.link_extractor.extract_links(response) if l not in seen] if links and rule.process_links: links = rule.process_links(links) seen = seen.union(links) for link in links: r = Request(url=link.url, callback=self._response_downloaded) r.meta.update(rule=n, link_text=link.text) yield rule.process_request(r)
rule = self._rules[response.meta['rule']] return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
if callback: cb_res = callback(response, **cb_kwargs) or () cb_res = self.process_results(response, cb_res) for requests_or_item in iterate_spider_output(cb_res): yield requests_or_item
if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True): for request_or_item in self._requests_to_follow(response): yield request_or_item
if callable(method): return method elif isinstance(method, basestring): return getattr(self, method, None)
rule.callback = get_method(rule.callback) rule.process_links = get_method(rule.process_links) rule.process_request = get_method(rule.process_request) |