Source code for kingfisher_scrapy.base_spiders.compressed_file_spider
from io import BytesIO
from zipfile import ZipFile
import orjson
from pathvalidate import sanitize_filename
from rarfile import RarFile
from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import UnknownArchiveFormatError
from kingfisher_scrapy.util import get_file_name_and_extension
[docs]
class CompressedFileSpider(BaseSpider):
    """
    Collect data from ZIP or RAR files.

    It assumes all files have the same data type. Each compressed file is saved to disk. The archive file is *not*
    saved to disk.

    #. Inherit from ``CompressedFileSpider``
    #. Set a ``data_type`` class attribute to the data type of the compressed files
    #. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
    #. Optionally, add a ``yield_non_archive_file = True`` class attribute if the spider requests both archive files
       and JSON files. Otherwise, the spider raises an ``UnknownArchiveFormatError`` exception.
    #. Optionally, add a ``file_name_must_contain = 'text'`` class attribute to only decompress the files whose paths
       contain the given text.
    #. Optionally, add a ``file_name_must_not_contain = 'text'`` class attribute to only decompress the files whose
       paths do not contain the given text.
    #. Optionally, add a ``skip_empty_releases = True`` class attribute to skip files with empty ``releases`` arrays.
    #. Write a ``start()`` method to request the archive files

    .. code-block:: python

        from kingfisher_scrapy.base_spiders import CompressedFileSpider
        from kingfisher_scrapy.util import components

        class MySpider(CompressedFileSpider):
            name = 'my_spider'

            # CompressedFileSpider
            data_type = 'release_package'

            async def start(self):
                yield self.build_request('https://example.com/api/packages.zip', formatter=components(-1))

    .. note::

       ``concatenated_json = True``, ``line_delimited = True``, ``root_path``, ``data_type = 'release'`` and
       ``data_type = 'record'`` are not supported if ``resize_package = True``.
    """

    # BaseSpider
    dont_truncate = True

    # If true, a response that is neither a ZIP nor a RAR file is yielded as-is instead of raising an error.
    yield_non_archive_file = False
    # If non-empty, only the members whose paths contain this text are decompressed.
    file_name_must_contain = ""
    # If non-empty, the members whose paths contain this text are skipped.
    file_name_must_not_contain = ""
    # If true, members whose ``releases`` array is empty are skipped.
    skip_empty_releases = False

    def parse(self, response):
        """
        Process the response body as an archive file, named after the request's ``file_name`` meta value.

        :param response: the response whose body is the archive file
        :returns: a generator of the items yielded by :meth:`process_archive_file`
        """
        yield from self.process_archive_file(response, response.request.meta["file_name"], response.body)

    def process_archive_file(self, response, archive_file_name, archive_data):
        """
        Yield an item for each eligible member of a ZIP or RAR archive, recursing into nested archives.

        A member is skipped if its size is zero, it's a directory, it's in the ``__MACOSX`` directory, or it's
        excluded by ``file_name_must_contain`` or ``file_name_must_not_contain``. If ``skip_empty_releases`` is set,
        a member whose ``releases`` array is empty is also skipped.

        :param response: the response from which the archive was downloaded (its request URL is set on each item)
        :param str archive_file_name: the archive's file name, used to detect its format and to prefix the names
                                      of its members
        :param bytes archive_data: the archive's content
        :returns: a generator of the items built from the archive's members
        :raises UnknownArchiveFormatError: if the file is neither a ZIP nor a RAR file and
                                           ``yield_non_archive_file`` is false (raised once the generator is
                                           iterated)
        """
        archive_name, archive_format = get_file_name_and_extension(archive_file_name)

        # NOTE: If support is added for additional archive formats, remember to update the `Data` type in `items.py`.
        if archive_format == "rar":
            cls = RarFile
        elif archive_format == "zip":
            cls = ZipFile
        elif self.yield_non_archive_file:
            yield self.build_file_from_response(
                response, file_name=archive_file_name, data_type=self.data_type, data=archive_data
            )
            return
        else:
            raise UnknownArchiveFormatError(archive_file_name)

        # If we use a context manager here, the archive file might close before the item pipeline reads from the file
        # handlers of the compressed files.
        archive_file = cls(BytesIO(archive_data))

        number = 1
        for file_info in archive_file.infolist():
            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
            if self.sample and number > self.sample:
                break

            # Skip the file if its size is zero, it's a directory, it's in the __MACOSX directory, or it's excluded
            # by the spider's configuration.
            if (
                not file_info.file_size
                or (archive_format == "rar" and file_info.isdir())
                or (archive_format == "zip" and file_info.is_dir())
                or file_info.filename.startswith("__MACOSX")
                or self.file_name_must_contain not in file_info.filename
                or (self.file_name_must_not_contain and self.file_name_must_not_contain in file_info.filename)
            ):
                continue

            file_name = f"{archive_name}-{sanitize_filename(file_info.filename)}"

            # If the file is itself an archive.
            if file_info.filename.endswith((".rar", ".zip")):
                with archive_file.open(file_info.filename) as f:
                    yield from self.process_archive_file(response, file_name, f.read())
            else:
                if not file_name.endswith(".json"):
                    file_name += ".json"

                # If `resize_package = True`, then we need to open the file twice: once to extract the package
                # metadata and then to extract the releases themselves.
                if self.resize_package:
                    data = {
                        "data": archive_file.open(file_info.filename),
                        "package": archive_file.open(file_info.filename),
                    }
                elif self.skip_empty_releases:
                    # The content is read in full, so the handle is closed right away (including when the file is
                    # skipped), instead of being left open.
                    with archive_file.open(file_info.filename) as compressed_file:
                        data = compressed_file.read()
                    if not orjson.loads(data)["releases"]:
                        continue
                else:
                    # The handle is left open on purpose: the item pipeline reads from it later.
                    data = archive_file.open(file_info.filename)

                # Spiders can override build_file and return None.
                item = self.build_file(
                    file_name=file_name,
                    url=response.request.url,
                    data_type=self.data_type,
                    data=data,
                )
                if item:
                    yield item
                    # NOTE(review): the nesting level of this increment was not recoverable from the flattened
                    # source; here, only yielded items count towards the sample limit. Confirm against upstream.
                    number += 1