import os
from scrapy import signals
from scrapy.exceptions import StopDownload
from kingfisher_scrapy import util
from kingfisher_scrapy.items import PluckedItem
# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class Pluck:
    """
    Appends one data value from one plucked item to a file. See the :ref:`pluck` command.

    Each extension instance writes at most one line: the value of the first ``PluckedItem`` that is scraped or,
    if no such item was scraped before the spider closed, the reason the spider closed.
    """

    def __init__(self, directory, max_bytes):
        """
        :param directory: the directory in which the output file is written
        :param max_bytes: the number of received bytes at which downloads are stopped. If falsy, ``from_crawler``
                          doesn't connect the ``bytes_received`` handler, so no limit is applied.
        """
        self.directory = directory
        # Scrapy settings set on the command line are strings. Coerce to an integer, so that the comparison in
        # `bytes_received` can't raise a TypeError. Falsy values (no limit) are stored unchanged.
        self.max_bytes = int(max_bytes) if max_bytes else max_bytes

        # The number of bytes received.
        self.total_bytes_received = 0
        # Whether `item_scraped` has been called.
        self.item_scraped_called = False

    @classmethod
    def from_crawler(cls, crawler):
        """
        Builds the extension from the ``KINGFISHER_PLUCK_PATH`` and ``KINGFISHER_PLUCK_MAX_BYTES`` settings, and
        connects its signal handlers.

        :param crawler: the crawler whose settings are read and to whose signals the handlers are connected
        :returns: the extension instance
        """
        directory = crawler.settings['KINGFISHER_PLUCK_PATH']
        max_bytes = crawler.settings['KINGFISHER_PLUCK_MAX_BYTES']

        extension = cls(directory=directory, max_bytes=max_bytes)
        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
        # Test the coerced value, so that a string like '0' disables the limit, instead of being truthy.
        if extension.max_bytes:
            crawler.signals.connect(extension.bytes_received, signal=signals.bytes_received)

        return extension

    def bytes_received(self, data, request, spider):
        """
        Adds the length of the received data to the running total, and stops the download once the total reaches
        ``max_bytes``.

        :param data: the chunk of data received
        :param request: the request for which the data was received
        :param spider: the spider that issued the request
        :raises scrapy.exceptions.StopDownload: if the total number of bytes received reaches ``max_bytes``
        """
        if (
            not spider.pluck
            or spider.dont_truncate
            # We only limit bytes received for final requests (i.e. where the callback is the default `parse` method).
            or request.callback
            # ijson will parse the value at `root_path`, which can go to the end of the file.
            # https://github.com/ICRAR/ijson/issues/43
            or spider.root_path
            # XLSX files must be read in full.
            or spider.unflatten
        ):
            return

        self.total_bytes_received += len(data)
        if self.total_bytes_received >= self.max_bytes:
            raise StopDownload(fail=False)

    def item_scraped(self, item, spider):
        """
        Writes the value of the first ``PluckedItem`` scraped by a plucking spider. Later items are ignored.

        :param item: the scraped item
        :param spider: the spider that scraped the item
        """
        if not spider.pluck or self.item_scraped_called or not isinstance(item, PluckedItem):
            return

        self.item_scraped_called = True

        self._write(spider, item.value)

    def spider_closed(self, spider, reason):
        """
        Writes the reason the spider closed, if a plucking spider closes without a value having been written.

        :param spider: the spider that closed
        :param reason: the reason the spider closed
        """
        if not spider.pluck or self.item_scraped_called:
            return

        self._write(spider, f'closed: {reason}')

    def _write(self, spider, value):
        """
        Appends a ``value,spider name`` line to the spider's pluck file in the output directory.

        :param spider: the spider, used for the filename and for the second column
        :param value: the value to write in the first column
        """
        path = os.path.join(self.directory, util.pluck_filename(spider))
        # Scraped values can contain any character. Use an explicit encoding, so that writing doesn't depend on
        # (or fail under) the platform's default encoding.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(f'{value},{spider.name}\n')