Source code for kingfisher_scrapy.extensions.pluck

import os

from scrapy import signals
from scrapy.exceptions import StopDownload

from kingfisher_scrapy import util
from kingfisher_scrapy.items import PluckedItem


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class Pluck:
    """
    Appends one data value from one plucked item to a file. See the :ref:`pluck` command.

    The handlers do nothing unless ``spider.pluck`` is truthy. At most one line is written per extension instance:
    either the value of the first ``PluckedItem`` scraped, or, if none was scraped, the reason the spider closed.
    """

    def __init__(self, directory, max_bytes):
        """
        :param directory: the directory in which the output file is written
        :param max_bytes: the number of counted bytes at or above which a download is stopped
        """
        self.directory = directory
        self.max_bytes = max_bytes

        # The number of bytes received, summed over every chunk that `bytes_received` counts.
        self.total_bytes_received = 0
        # Whether `item_scraped` has written a plucked value (set just before the write).
        self.item_scraped_called = False

    @classmethod
    def from_crawler(cls, crawler):
        """
        Builds the extension from the ``KINGFISHER_PLUCK_PATH`` and ``KINGFISHER_PLUCK_MAX_BYTES`` settings, and
        connects its signal handlers.

        :param crawler: the crawler whose settings are read and whose signals are connected
        :returns: the extension instance
        """
        directory = crawler.settings['KINGFISHER_PLUCK_PATH']
        max_bytes = crawler.settings['KINGFISHER_PLUCK_MAX_BYTES']

        extension = cls(directory=directory, max_bytes=max_bytes)
        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
        # Without a byte limit, there is nothing to enforce, so the handler is not connected.
        if max_bytes:
            crawler.signals.connect(extension.bytes_received, signal=signals.bytes_received)

        return extension

    def bytes_received(self, data, request, spider):
        """
        Counts the bytes received and stops the download once ``max_bytes`` is reached.

        :param data: the chunk of data received
        :param request: the request that generated the download
        :param spider: the spider associated with the response
        :raises scrapy.exceptions.StopDownload: with ``fail=False``, if the counted bytes reach ``max_bytes``
        """
        if (
            not spider.pluck
            or spider.dont_truncate
            # We only limit bytes received for final requests (i.e. where the callback is the default `parse` method).
            or request.callback
            # ijson will parse the value at `root_path`, which can go to the end of the file.
            # https://github.com/ICRAR/ijson/issues/43
            or spider.root_path
            # XLSX files must be read in full.
            or spider.unflatten
        ):
            return

        self.total_bytes_received += len(data)
        if self.total_bytes_received >= self.max_bytes:
            raise StopDownload(fail=False)

    def item_scraped(self, item, spider):
        """
        Writes the value of the first ``PluckedItem`` scraped. Later items, and items of other types, are ignored.

        :param item: the scraped item
        :param spider: the spider that scraped the item
        """
        if not spider.pluck or self.item_scraped_called or not isinstance(item, PluckedItem):
            return

        self.item_scraped_called = True

        self._write(spider, item.value)

    def spider_closed(self, spider, reason):
        """
        Writes the reason the spider closed, if no plucked value was written.

        :param spider: the spider that was closed
        :param reason: the reason the spider was closed
        """
        if not spider.pluck or self.item_scraped_called:
            return

        self._write(spider, f'closed: {reason}')

    def _write(self, spider, value):
        """
        Appends a ``{value},{spider.name}`` line to the file named by ``util.pluck_filename(spider)`` in the
        configured directory.

        :param spider: the spider whose name is written
        :param value: the value to write
        """
        path = os.path.join(self.directory, util.pluck_filename(spider))
        # Append-only access is sufficient, as the file is never read here. The encoding is set explicitly, so that
        # non-ASCII values are written the same way regardless of the platform's locale.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(f'{value},{spider.name}\n')