Source code for kingfisher_scrapy.extensions.pluck

import csv
import os

import orjson
from scrapy import signals
from scrapy.exceptions import StopDownload

from kingfisher_scrapy import util
from kingfisher_scrapy.items import PluckedItem


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class Pluck:
    """
    Appends one data value from one plucked item to a file. See the :ref:`pluck` command.

    The extension does nothing for spiders whose ``pluck`` attribute is falsy. Otherwise, it writes exactly one CSV
    row (``value,spider name``) per crawl: either the value of the first :class:`PluckedItem` scraped or, if none was
    scraped, the reason the spider closed. If a byte limit is configured, it also stops the download of final
    responses once the limit is reached.
    """

    def __init__(self, directory, max_bytes):
        """
        :param directory: the directory in which the output file is written
        :param max_bytes: the number of received bytes at which to stop a download (falsy to disable the limit)
        """
        self.directory = directory
        self.max_bytes = max_bytes

        # The number of bytes received.
        self.total_bytes_received = 0
        # Whether `item_scraped` has been called.
        self.item_scraped_called = False

    @classmethod
    def from_crawler(cls, crawler):
        """
        Build the extension from the crawler's ``KINGFISHER_PLUCK_PATH`` and ``KINGFISHER_PLUCK_MAX_BYTES`` settings,
        and connect its signal handlers.

        :param crawler: the Scrapy crawler
        :returns: the configured extension
        """
        directory = crawler.settings["KINGFISHER_PLUCK_PATH"]
        max_bytes = crawler.settings["KINGFISHER_PLUCK_MAX_BYTES"]

        extension = cls(directory=directory, max_bytes=max_bytes)
        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
        # Only count bytes if a limit is configured.
        if max_bytes:
            crawler.signals.connect(extension.bytes_received, signal=signals.bytes_received)

        return extension

    def bytes_received(self, data, request, spider):
        """
        Count the bytes received and stop the download once ``max_bytes`` is reached.

        :param data: the chunk of bytes received
        :param request: the request that generated the download
        :param spider: the spider associated with the response
        :raises scrapy.exceptions.StopDownload: if the total number of bytes received reaches ``max_bytes``
        """
        if (
            not spider.pluck
            or spider.dont_truncate
            # We only limit bytes received for final requests (i.e. where the callback is the default `parse` method).
            or request.callback
            # ijson will parse the value at `root_path`, which can go to the end of the file.
            # https://github.com/ICRAR/ijson/issues/43
            or spider.root_path
            # XLSX files must be read in full.
            or spider.unflatten
        ):
            return

        self.total_bytes_received += len(data)
        if self.total_bytes_received >= self.max_bytes:
            # Per Scrapy's documentation, `fail=False` hands the partial response to the request's callback
            # instead of its errback.
            raise StopDownload(fail=False)

    def item_scraped(self, item, spider):
        """
        Write the value of the first :class:`PluckedItem` scraped. Later items and other item types are ignored.

        :param item: the scraped item
        :param spider: the spider that scraped the item
        """
        if not spider.pluck or self.item_scraped_called or not isinstance(item, PluckedItem):
            return

        self.item_scraped_called = True

        self._write(spider, item.value)

    def spider_closed(self, spider, reason):
        """
        Write ``closed: {reason}`` if no value was written during the crawl, so that each spider gets one row.

        :param spider: the spider that was closed
        :param reason: the reason the spider was closed
        """
        if not spider.pluck or self.item_scraped_called:
            return

        self._write(spider, f"closed: {reason}")

    def _write(self, spider, value):
        """
        Append a ``value,spider name`` row to the spider's pluck file in ``directory``.

        :param spider: the spider, used to determine the filename and the second column
        :param value: the value to write; a non-string value is serialized to JSON first
        """
        path = os.path.join(self.directory, util.pluck_filename(spider))
        # `encoding="utf-8"`: orjson emits non-ASCII characters unescaped, so don't rely on the locale's encoding.
        # `newline=""`: required by the csv module, so that the "\n" line terminator (and any newlines embedded in
        # the value) are written as-is on every platform.
        with open(path, "a", encoding="utf-8", newline="") as f:
            if not isinstance(value, str):
                value = orjson.dumps(value, default=util.default).decode()
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow([value, spider.name])