# Source code for kingfisher_scrapy.base_spiders.compressed_file_spider

from io import BytesIO
from zipfile import ZipFile

import orjson
from pathvalidate import sanitize_filename
from rarfile import RarFile

from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import UnknownArchiveFormatError
from kingfisher_scrapy.util import get_file_name_and_extension


class CompressedFileSpider(BaseSpider):
    """
    Collect data from ZIP or RAR files. It assumes all files have the same data type. Each compressed file is saved to
    disk. The archive file is *not* saved to disk.

    #. Inherit from ``CompressedFileSpider``
    #. Set a ``data_type`` class attribute to the data type of the compressed files
    #. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
    #. Optionally, add a ``yield_non_archive_file = True`` class attribute if the spider requests both archive files
       and JSON files. Otherwise, the spider raises an ``UnknownArchiveFormatError`` exception.
    #. Optionally, add a ``file_name_must_contain = 'text'`` class attribute to only decompress the files whose paths
       contain the given text.
    #. Optionally, add a ``file_name_must_not_contain = 'text'`` class attribute to only decompress the files whose
       paths do not contain the given text.
    #. Optionally, add a ``skip_empty_releases = True`` class attribute to skip files with empty ``releases`` arrays.
    #. Write a ``start()`` method to request the archive files

    .. code-block:: python

        from kingfisher_scrapy.base_spiders import CompressedFileSpider
        from kingfisher_scrapy.util import components

        class MySpider(CompressedFileSpider):
            name = 'my_spider'

            # CompressedFileSpider
            data_type = 'release_package'

            async def start(self):
                yield self.build_request('https://example.com/api/packages.zip', formatter=components(-1))

    .. note::

       ``concatenated_json = True``, ``line_delimited = True``, ``root_path``, ``data_type = 'release'`` and
       ``data_type = 'record'`` are not supported if ``resize_package = True``.
    """

    # BaseSpider
    dont_truncate = True

    yield_non_archive_file = False
    file_name_must_contain = ""
    file_name_must_not_contain = ""
    skip_empty_releases = False

    def parse(self, response):
        """
        Treat the response body as an archive and yield an item for each eligible file inside it.

        The archive's name is read from the ``file_name`` key of the request's ``meta``.
        """
        yield from self.process_archive_file(response, response.request.meta["file_name"], response.body)

    def process_archive_file(self, response, archive_file_name, archive_data):
        """
        Yield an item for each eligible file in the archive, descending into nested ZIP and RAR archives.

        :param response: the response that delivered the outermost archive; its request URL is set on each item
        :param str archive_file_name: the archive's file name, used to detect its format and to prefix the names of
            the files it contains
        :param bytes archive_data: the archive's raw content
        :raises UnknownArchiveFormatError: if the extension is neither ``rar`` nor ``zip`` and
            ``yield_non_archive_file`` is false
        """
        archive_name, archive_format = get_file_name_and_extension(archive_file_name)

        # NOTE: If support is added for additional archive formats, remember to update the `Data` type in `items.py`.
        if archive_format == "rar":
            cls = RarFile
        elif archive_format == "zip":
            cls = ZipFile
        elif self.yield_non_archive_file:
            # The response isn't an archive: pass it through unchanged.
            yield self.build_file_from_response(
                response, file_name=archive_file_name, data_type=self.data_type, data=archive_data
            )
            return
        else:
            raise UnknownArchiveFormatError(archive_file_name)

        # If we use a context manager here, the archive file might close before the item pipeline reads from the file
        # handlers of the compressed files.
        archive_file = cls(BytesIO(archive_data))

        number = 1
        for file_info in archive_file.infolist():
            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
            if self.sample and number > self.sample:
                break

            # Skip the file if its size is zero, it's a directory, it's in the __MACOSX directory, or it's excluded
            # by the spider's configuration.
            if (
                not file_info.file_size
                or (archive_format == "rar" and file_info.isdir())
                or (archive_format == "zip" and file_info.is_dir())
                or file_info.filename.startswith("__MACOSX")
                or self.file_name_must_contain not in file_info.filename
                or (self.file_name_must_not_contain and self.file_name_must_not_contain in file_info.filename)
            ):
                continue

            file_name = f"{archive_name}-{sanitize_filename(file_info.filename)}"

            # If the file is itself an archive.
            if file_info.filename.endswith((".rar", ".zip")):
                with archive_file.open(file_info.filename) as f:
                    yield from self.process_archive_file(response, file_name, f.read())
                continue

            if not file_name.endswith(".json"):
                file_name += ".json"

            if self.resize_package:
                # If `resize_package = True`, then we need to open the file twice: once to extract the package
                # metadata and then to extract the releases themselves.
                data = {
                    "data": archive_file.open(file_info.filename),
                    "package": archive_file.open(file_info.filename),
                }
            elif self.skip_empty_releases:
                # The content is read in full here, so the handle can be closed immediately, whether or not the
                # file is skipped.
                with archive_file.open(file_info.filename) as compressed_file:
                    data = compressed_file.read()
                if not orjson.loads(data)["releases"]:
                    continue
            else:
                # The handle is left open, because the item pipeline reads from it later.
                data = archive_file.open(file_info.filename)

            # Spiders can override build_file and return None.
            item = self.build_file(
                file_name=file_name,
                url=response.request.url,
                data_type=self.data_type,
                data=data,
            )
            if item:
                yield item
                number += 1