Source code for kingfisher_scrapy.base_spiders.compressed_file_spider

from io import BytesIO
from zipfile import ZipFile

import orjson
from pathvalidate import sanitize_filename
from rarfile import RarFile

from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import UnknownArchiveFormatError
from kingfisher_scrapy.util import get_file_name_and_extension



[docs]
class CompressedFileSpider(BaseSpider):
    """
    Collect data from ZIP or RAR files.

    It assumes all files have the same data type. Each compressed file is saved to disk. The archive file is *not*
    saved to disk.

    Compressed files whose names end in ``.zip`` or ``.rar`` are treated as nested archives and are decompressed
    recursively. Empty files, directories and entries whose paths start with ``__MACOSX`` are skipped.

    #. Inherit from ``CompressedFileSpider``
    #. Set a ``data_type`` class attribute to the data type of the compressed files
    #. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
    #. Optionally, add a ``yield_non_archive_file = True`` class attribute if the spider requests both archive files
       and JSON files. Otherwise, the spider raises an ``UnknownArchiveFormatError`` exception.
    #. Optionally, add a ``file_name_must_contain = 'text'`` class attribute to only decompress the files whose paths
       contain the given text.
    #. Optionally, add a ``file_name_must_not_contain = 'text'`` class attribute to only decompress the files whose
       paths do not contain the given text.
    #. Optionally, add a ``skip_empty_releases = True`` class attribute to skip files with empty ``releases`` arrays.
    #. Write a ``start()`` method to request the archive files

    .. code-block:: python

        from kingfisher_scrapy.base_spiders import CompressedFileSpider
        from kingfisher_scrapy.util import components

        class MySpider(CompressedFileSpider):
            name = 'my_spider'

            # CompressedFileSpider
            data_type = 'release_package'

            async def start(self):
                yield self.build_request('https://example.com/api/packages.zip', formatter=components(-1))

    .. note::

       ``concatenated_json = True``, ``line_delimited = True``, ``root_path``, ``data_type = 'release'`` and
       ``data_type = 'record'`` are not supported if ``resize_package = True``.
    """

    # BaseSpider
    # NOTE(review): the effect of this flag is defined by BaseSpider, which is not visible here - confirm there.
    dont_truncate = True

    # If true, a response whose file name has neither a "rar" nor a "zip" extension is yielded as-is, instead of
    # raising UnknownArchiveFormatError.
    yield_non_archive_file = False
    # Only decompress the files whose paths contain this text. The empty string matches every path.
    file_name_must_contain = ""
    # Only decompress the files whose paths do not contain this text. The empty string disables this filter.
    file_name_must_not_contain = ""
    # If true, skip the files whose "releases" array is empty. This is only checked if `resize_package` is falsy.
    skip_empty_releases = False


[docs]
    def parse(self, response):
        """
        Process the response's body as an archive file, and yield the resulting items.

        The archive's file name is read from the ``file_name`` key of the request's ``meta`` attribute.
        """
        archive_file_name = response.request.meta["file_name"]
        yield from self.process_archive_file(response, archive_file_name, response.body)



[docs]
    def process_archive_file(self, response, archive_file_name, archive_data):
        """
        Yield an item for each eligible file in a ZIP or RAR archive, processing nested archives recursively.

        A file is skipped if its size is zero, it's a directory, its path starts with ``__MACOSX``, or it's excluded
        by ``file_name_must_contain`` or ``file_name_must_not_contain``. If ``sample`` is set, iteration stops once
        that many items have been yielded from this archive (items from nested archives are counted by the nested
        call only).

        :param response: the response for the archive file, whose request URL is set as the URL of each item
        :param str archive_file_name: the archive's file name, whose extension determines the archive format
        :param bytes archive_data: the archive's content
        :raises UnknownArchiveFormatError: if the file name's extension is neither ``rar`` nor ``zip``, and
            ``yield_non_archive_file`` is falsy
        """
        archive_name, archive_format = get_file_name_and_extension(archive_file_name)

        # NOTE: If support is added for additional archive formats, remember to update the `Data` type in `items.py`.
        if archive_format == "rar":
            cls = RarFile
        elif archive_format == "zip":
            cls = ZipFile
        elif self.yield_non_archive_file:
            # The response isn't an archive: yield its content as a single file.
            yield self.build_file_from_response(
                response, file_name=archive_file_name, data_type=self.data_type, data=archive_data
            )
            return
        else:
            raise UnknownArchiveFormatError(archive_file_name)

        # If we use a context manager here, the archive file might close before the item pipeline reads from the file
        # handlers of the compressed files.
        archive_file = cls(BytesIO(archive_data))

        # The number of the next item to yield from this archive, to compare against the sample size.
        number = 1
        for file_info in archive_file.infolist():
            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
            if self.sample and number > self.sample:
                break

            # Skip the file if its size is zero, it's a directory, it's in the __MACOSX directory, or it's excluded
            # by the spider's configuration.
            if (
                not file_info.file_size
                or (archive_format == "rar" and file_info.isdir())
                or (archive_format == "zip" and file_info.is_dir())
                or file_info.filename.startswith("__MACOSX")
                or self.file_name_must_contain not in file_info.filename
                or (self.file_name_must_not_contain and self.file_name_must_not_contain in file_info.filename)
            ):
                continue

            # Prefix the archive's name, so that files from different archives don't share a file name.
            file_name = f"{archive_name}-{sanitize_filename(file_info.filename)}"

            # If the file is itself an archive.
            if file_info.filename.endswith((".rar", ".zip")):
                # The nested archive is read into memory in full, so its file handler can be closed immediately.
                with archive_file.open(file_info.filename) as f:
                    yield from self.process_archive_file(response, file_name, f.read())
            else:
                if not file_name.endswith(".json"):
                    file_name += ".json"

                compressed_file = archive_file.open(file_info.filename)

                # If `resize_package = True`, then we need to open the file twice: once to extract the package metadata
                # and then to extract the releases themselves.
                if self.resize_package:
                    data = {"data": compressed_file, "package": archive_file.open(file_info.filename)}
                elif self.skip_empty_releases:
                    # The content is read into memory in full, so close the file handler right away, instead of
                    # leaving it open (including when the file is skipped) until it's garbage collected.
                    with compressed_file:
                        data = compressed_file.read()
                    if not orjson.loads(data)["releases"]:
                        continue
                else:
                    # The file handler is left open, for the item pipeline to read from.
                    data = compressed_file

                # Spiders can override build_file and return None.
                item = self.build_file(
                    file_name=file_name,
                    url=response.request.url,
                    data_type=self.data_type,
                    data=data,
                )
                if item:
                    yield item
                    number += 1