Source code for kingfisher_scrapy.extensions.pluck
import csv
import os
import orjson
from scrapy import signals
from scrapy.exceptions import StopDownload
from kingfisher_scrapy import util
from kingfisher_scrapy.items import PluckedItem
# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
[docs]
class Pluck:
    """Appends one data value from one plucked item to a file. See the :ref:`pluck` command."""

    def __init__(self, directory, max_bytes):
        """
        :param directory: the directory in which the output file is written
        :param max_bytes: the number of received bytes at which a download is stopped
        """
        self.directory = directory
        self.max_bytes = max_bytes

        # The number of bytes received.
        self.total_bytes_received = 0
        # Whether `item_scraped` has been called.
        self.item_scraped_called = False

    @classmethod
    def from_crawler(cls, crawler):
        """
        Build the extension from the ``KINGFISHER_PLUCK_PATH`` and ``KINGFISHER_PLUCK_MAX_BYTES`` settings, and
        connect its signal handlers.

        The ``bytes_received`` handler is connected only if ``KINGFISHER_PLUCK_MAX_BYTES`` is truthy.
        """
        directory = crawler.settings["KINGFISHER_PLUCK_PATH"]
        max_bytes = crawler.settings["KINGFISHER_PLUCK_MAX_BYTES"]

        extension = cls(directory=directory, max_bytes=max_bytes)
        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
        if max_bytes:
            crawler.signals.connect(extension.bytes_received, signal=signals.bytes_received)

        return extension

    def bytes_received(self, data, request, spider):
        """
        Count the bytes received and stop the download once ``max_bytes`` is reached.

        Nothing is counted if the spider isn't plucking, opts out of truncation, or if the request or spider
        matches one of the exclusions commented below.

        :raises scrapy.exceptions.StopDownload: if the running total of bytes received reaches ``max_bytes``
        """
        if (
            not spider.pluck
            or spider.dont_truncate
            # We only limit bytes received for final requests (i.e. where the callback is the default `parse` method).
            or request.callback
            # ijson will parse the value at `root_path`, which can go to the end of the file.
            # https://github.com/ICRAR/ijson/issues/43
            or spider.root_path
            # XLSX files must be read in full.
            or spider.unflatten
        ):
            return

        self.total_bytes_received += len(data)
        if self.total_bytes_received >= self.max_bytes:
            # `fail=False` is passed so that the truncated response goes to the request's callback instead of its
            # errback, per Scrapy's documentation of `StopDownload`.
            raise StopDownload(fail=False)

    def item_scraped(self, item, spider):
        """Write the value of the first ``PluckedItem`` scraped. Other items and later calls are ignored."""
        if not spider.pluck or self.item_scraped_called or not isinstance(item, PluckedItem):
            return

        self.item_scraped_called = True
        self._write(spider, item.value)

    def spider_closed(self, spider, reason):
        """Write the reason the spider closed, if no plucked item was written."""
        if not spider.pluck or self.item_scraped_called:
            return

        self._write(spider, f"closed: {reason}")

    def _write(self, spider, value):
        """
        Append a CSV row, containing the value and the spider's name, to the spider's pluck file.

        A value that isn't a string is serialized as JSON first.
        """
        if not isinstance(value, str):
            value = orjson.dumps(value, default=util.default).decode()

        path = os.path.join(self.directory, util.pluck_filename(spider))
        # The csv module requires `newline=""`: otherwise, on platforms that translate newlines, the "\n" line
        # terminator and any newline inside a quoted value are rewritten as "\r\n".
        with open(path, "a", newline="") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow([value, spider.name])