Coverage for scrapy/contrib/feedexport : 46%
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
|
""" Feed Exports extension
See documentation in docs/topics/feed-exports.rst """
"""Interface that all Feed Storages must implement"""
"""Initialize the storage with the parameters given in the URI"""
"""Open the storage for the given spider. It must return a file-like object that will be used for the exporters"""
"""Store the given file stream"""
# NOTE(review): method-body fragment; the enclosing `def` line is not part of
# this extract (presumably a storage's `open` -- confirm against the real file).
# Hands back whatever TemporaryFile(prefix='feed-') returns.
return TemporaryFile(prefix='feed-')
# NOTE(review): method-body fragment; the enclosing `def` line is not part of
# this extract (presumably a storage's `store` -- confirm).
# Passes self._store_in_thread and `file` to threads.deferToThread and returns
# that call's result to the caller.
return threads.deferToThread(self._store_in_thread, file)
# NOTE(review): method-body fragment; the enclosing `def` line is not part of
# this extract. Unconditionally raises NotImplementedError -- presumably a
# hook that concrete storages override; confirm against the real file.
raise NotImplementedError
# NOTE(review): initializer-body fragment for what looks like the S3 storage;
# the enclosing `def` line is not part of this extract.

# boto is optional: without it this storage reports itself as not configured.
try:
    import boto
except ImportError:
    raise NotConfigured
self.connect_s3 = boto.connect_s3

# The URI carries the bucket (host part) and the key name (path part).
parsed = urlparse(uri)
self.bucketname = parsed.hostname
self.keyname = parsed.path

# Credentials embedded in the URI win over the AWS_* settings.
self.access_key = parsed.username or settings['AWS_ACCESS_KEY_ID']
self.secret_key = parsed.password or settings['AWS_SECRET_ACCESS_KEY']
# NOTE(review): method-body fragment (presumably the S3 upload step); the
# enclosing `def` line is not part of this extract.

# Rewind so the whole stream is uploaded, not just what follows the cursor.
file.seek(0)
connection = self.connect_s3(self.access_key, self.secret_key)
# validate=False is passed straight through to get_bucket.
s3_bucket = connection.get_bucket(self.bucketname, validate=False)
s3_key = s3_bucket.new_key(self.keyname)
s3_key.set_contents_from_file(file)
s3_key.close()
# NOTE(review): initializer-body fragment for what looks like the FTP storage;
# the enclosing `def` line is not part of this extract.

# Split the feed URI into the connection parameters used when storing.
parsed = urlparse(uri)
self.host = parsed.hostname
# Fall back to port 21 when the URI does not name one.
self.port = int(parsed.port or '21')
self.username = parsed.username
self.password = parsed.password
self.path = parsed.path
# NOTE(review): method-body fragment (presumably the FTP upload step); the
# enclosing `def` line is not part of this extract.
#
# Uploads `file` to self.path on the configured FTP server. Any error raised
# while connecting, logging in, creating directories or transferring is
# propagated to the caller after the connection has been closed.

# Rewind so the whole stream is uploaded, not just what follows the cursor.
file.seek(0)
dirname, filename = posixpath.split(self.path)
ftp = FTP()
try:
    ftp.connect(self.host, self.port)
    ftp.login(self.username, self.password)
    # Helper defined elsewhere; called with the connection and target directory.
    ftp_makedirs_cwd(ftp, dirname)
    ftp.storbinary('STOR %s' % filename, file)
except Exception:
    # Previously a failure here skipped quit() and leaked the socket.
    # close() drops the connection without talking to the server again.
    ftp.close()
    raise
# Polite shutdown on the success path (quit() also closes the connection).
ftp.quit()
# NOTE(review): initializer-body fragment; the enclosing `def` line is not
# part of this extract. Elsewhere in this file the object is built as
# SpiderSlot(file, exporter, storage, uri).

# Counter starts at zero; item_scraped-style code increments it per item.
self.itemcount = 0

# Keep references to everything needed to finish and store the feed later.
self.uri = uri
self.storage = storage
self.exporter = exporter
self.file = file
# NOTE(review): initializer-body fragment; the enclosing `def` line (and
# whatever assigns self.urifmt, which is read below) is not part of this
# extract.

self.format = settings['FEED_FORMAT'].lower()
self.storages = self._load_components('FEED_STORAGES')
self.exporters = self._load_components('FEED_EXPORTERS')

# Disable the extension when either the storage scheme or the output format
# cannot be served; the helper methods log the reason.
if not self._storage_supported(self.urifmt):
    raise NotConfigured
if not self._exporter_supported(self.format):
    raise NotConfigured

self.store_empty = settings.getbool('FEED_STORE_EMPTY')

# Optional hook, called later as self._uripar(params, spider); it defaults
# to a no-op when the setting is empty.
uripar_path = settings['FEED_URI_PARAMS']
if uripar_path:
    self._uripar = load_object(uripar_path)
else:
    self._uripar = lambda x, y: None

# Per-spider export state, keyed by spider.
self.slots = {}

dispatcher.connect(self.open_spider, signals.spider_opened)
dispatcher.connect(self.close_spider, signals.spider_closed)
dispatcher.connect(self.item_scraped, signals.item_scraped)
# NOTE(review): method-body fragment (presumably `open_spider`, which is the
# handler connected to signals.spider_opened elsewhere in this file); the
# enclosing `def` line is not part of this extract.

# Expand the URI template with the spider-derived parameters.
feed_uri = self.urifmt % self._get_uri_params(spider)
feed_storage = self._get_storage(feed_uri)
stream = feed_storage.open(spider)

# Start the exporter on the freshly opened stream before registering the slot.
item_exporter = self._get_exporter(stream)
item_exporter.start_exporting()
self.slots[spider] = SpiderSlot(stream, item_exporter, feed_storage, feed_uri)
slot = self.slots.pop(spider) if not slot.itemcount and not self.store_empty: return slot.exporter.finish_exporting() logfmt = "%%s %s feed (%d items) in: %s" % (self.format, \ slot.itemcount, slot.uri) d = defer.maybeDeferred(slot.storage.store, slot.file) d.addCallback(lambda _: log.msg(logfmt % "Stored", spider=spider)) d.addErrback(log.err, logfmt % "Error storing", spider=spider) return d
slot = self.slots[spider] slot.exporter.export_item(item) slot.itemcount += 1 return item
conf = dict(settings['%s_BASE' % setting_prefix]) conf.update(settings[setting_prefix]) d = {} for k, v in conf.items(): try: d[k] = load_object(v) except NotConfigured: pass return d
if format in self.exporters: return True log.msg("Unknown feed format: %s" % format, log.ERROR)
scheme = urlparse(uri).scheme if scheme in self.storages: try: self._get_storage(uri) return True except NotConfigured: log.msg("Disabled feed storage scheme: %s" % scheme, log.ERROR) else: log.msg("Unknown feed storage scheme: %s" % scheme, log.ERROR)
# NOTE(review): method-body fragment (presumably `_get_exporter`); the
# enclosing `def` line is not part of this extract.
# Looks up the entry registered for self.format in self.exporters and calls it
# with the caller's positional and keyword arguments.
return self.exporters[self.format](*a, **kw)
# NOTE(review): method-body fragment (presumably `_get_storage`); the
# enclosing `def` line is not part of this extract.
# Picks the entry in self.storages keyed by the URI's scheme and calls it with
# the full URI. An unregistered scheme raises KeyError if self.storages is a
# plain dict -- confirm.
return self.storages[urlparse(uri).scheme](uri)
params = {} for k in dir(spider): params[k] = getattr(spider, k) ts = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-') params['time'] = ts self._uripar(params, spider) return params |