Source code for kingfisher_scrapy.extensions.files_store
import math
import os
import zlib
import orjson
from scrapy import signals
from scrapy.exceptions import NotConfigured
from kingfisher_scrapy import util
from kingfisher_scrapy.items import File, FileItem
[docs]
class FilesStore:
"""Write items' data to individual files in a directory. See the :ref:`how-it-works` documentation."""
def __init__(self, directory):
self.directory = directory
[docs]
@classmethod
def relative_crawl_directory(cls, spider):
"""Return the crawl's relative directory, in the format `<spider_name>[_sample]/<YYMMDD_HHMMSS>`."""
spider_directory = spider.name
if spider.sample:
spider_directory += "_sample"
return os.path.join(spider_directory, spider.get_start_time("%Y%m%d_%H%M%S"))
[docs]
@classmethod
def from_crawler(cls, crawler):
directory = crawler.settings["FILES_STORE"]
if not directory:
raise NotConfigured("FILES_STORE is not set.")
extension = cls(directory)
crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
return extension
[docs]
def spider_opened(self, spider):
if hasattr(spider, "_job"):
path = os.path.join(self.relative_crawl_directory(spider), "scrapyd-job.txt")
self._write_file(path, spider._job)
[docs]
def spider_closed(self, spider, reason):
if reason not in {"finished", "sample"} or spider.pluck:
return
path = os.path.join(self.directory, self.relative_crawl_directory(spider))
if os.path.exists(path):
message = f"The data is available at: {path}"
else:
message = "Something went wrong. No data was downloaded."
message_length = math.ceil(len(message) / 2) * 2
title_length = message_length // 2 - 8
spider.logger.info(f"+-{'-' * title_length} DATA DIRECTORY {'-' * title_length}-+") # noqa: G004
spider.logger.info(f"| {' ' * message_length} |") # noqa: G004
spider.logger.info(f"| {message.ljust(message_length)} |") # noqa: G004
spider.logger.info(f"| {' ' * message_length} |") # noqa: G004
spider.logger.info(f"+-{'-' * message_length}-+") # noqa: G004
[docs]
def item_scraped(self, item, spider):
"""
If the item is a File or FileItem, write its data to the filename in a subdirectory of the crawl directory.
Return a dict with the metadata.
"""
if not isinstance(item, File | FileItem):
return
file_name = item.file_name
if isinstance(item, FileItem):
name, extension = util.get_file_name_and_extension(file_name)
file_name = f"{name}-{item.number}.{extension}"
path = os.path.join(self.relative_crawl_directory(spider), self._get_subdirectory(file_name), file_name)
self._write_file(path, item.data)
item.path = path
# https://github.com/rails/rails/blob/05ed261/activesupport/lib/active_support/cache/file_store.rb#L150-L175
@staticmethod
def _get_subdirectory(file_name):
checksum = zlib.adler32(file_name.encode())
checksum, dir_1 = divmod(checksum, 0x1000)
# 0x1000 is 4096, which should be sufficient, without another level of: dir_2 = checksum % 0x1000
return f"{dir_1:03X}"
def _write_file(self, path, data):
path = os.path.join(self.directory, path)
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "wb") as f:
if isinstance(data, bytes):
f.write(data)
elif isinstance(data, str):
f.write(data.encode()) # NOTE: should be UTF-8
else:
f.write(orjson.dumps(data, default=util.default))