Source code for kingfisher_scrapy.base_spiders.base_spider

import codecs
import datetime
import math
from email.utils import parsedate_to_datetime

import scrapy

from kingfisher_scrapy.exceptions import IncoherentConfigurationError, SpiderArgumentError
from kingfisher_scrapy.items import File, FileItem
from kingfisher_scrapy.util import add_path_components, add_query_string



[docs]
class BaseSpider(scrapy.Spider):
    """
    Base class for all spiders.

    With respect to the data's source:

    -  If the source can support ``from_date`` and ``until_date`` spider arguments:

       -  Set a ``date_format`` class attribute to "date", "datetime", "year" or "year-month" (default "date").
       -  Set a ``default_from_date`` class attribute to a date ("YYYY-MM-DD"), datetime ("YYYY-MM-DDTHH:MM:SS"),
          year ("YYYY") or year-month ("YYYY-MM").
       -  If the source stopped publishing, set a ``default_until_date`` class attribute to a date or datetime.

    -  If the spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
       ``date_format`` and ``default_from_date`` class attributes as above.
    -  If the spider needs to parse the JSON response in its ``parse`` method, set ``dont_truncate = True``.
       Otherwise, the :class:`~kingfisher_scrapy.extensions.pluck.Pluck` extension can fail when using the
       ``KINGFISHER_PLUCK_MAX_BYTES`` setting.

    .. tip::

        If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set,
        then ``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
        ``get_default_until_date()`` return value (which is the current time, by default).

    With respect to the data's format:

    -  If the data is not encoded using UTF-8, set an ``encoding`` class attribute to its encoding.
    -  If the data is concatenated JSON, add a ``concatenated_json = True`` class attribute.
    -  If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.
    -  If the data can be invalid JSON, add a ``validate_json = True`` class attribute.
    -  If the data embeds OCDS data within other objects or arrays, set a ``root_path`` class attribute to the path to
       the OCDS data, e.g. ``'releasePackage'`` or ``'results.item'``.
    -  If the data is in CSV or XLSX format, add a ``unflatten = True`` class attribute to convert it to JSON using
       Flatten Tool's ``unflatten`` function. To pass arguments to ``unflatten``, set a ``unflatten_args`` dict.
    -  If the data source uses OCDS 1.0, add an ``ocds_version = '1.0'`` class attribute. This is used for the
       :ref:`Kingfisher Process<kingfisher-process>` extension.

    With respect to the access method:

    -  If specific HTTP status codes are to be retried, set a ``retry_http_codes`` class attribute to a list of those
       status codes (see :meth:`is_http_retryable`).

    With respect to support for Kingfisher Collect's features:

    -  If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
    """

    # Maps each allowed value of the ``date_format`` class attribute to the strptime format used to parse date
    # arguments. ``__init__`` replaces ``self.date_format`` with the corresponding strptime format.
    VALID_DATE_FORMATS = {"date": "%Y-%m-%d", "datetime": "%Y-%m-%dT%H:%M:%S", "year": "%Y", "year-month": "%Y-%m"}

    # Regarding the data source.
    date_format = "date"
    date_required = False
    dont_truncate = False

    # Regarding the data format.
    encoding = "utf-8"
    # ``__init__`` raises IncoherentConfigurationError if both ``concatenated_json`` and ``line_delimited`` are True.
    concatenated_json = False
    line_delimited = False
    validate_json = False
    root_path = ""
    # NOTE(review): not referenced in this module and not described in the class docstring; presumably read by a
    # pipeline or extension - confirm.
    resize_package = False
    unflatten = False
    # This dict is shared by every class that doesn't override it: override it in sub-classes, don't mutate it.
    unflatten_args = {}
    ocds_version = "1.1"

    # Regarding the access method.
    # NOTE(review): not referenced in this module; presumably the maximum number of attempts per request - confirm
    # where it is read.
    max_attempts = 5
    # The status codes for which ``is_http_retryable()`` returns True. Like ``unflatten_args``, this list is shared:
    # override it in sub-classes, don't mutate it.
    retry_http_codes = []

    # Not to be overridden by sub-classes.
    # The only values accepted by the ``steps`` spider argument: ``__init__`` discards any others.
    available_steps = {"compile", "check"}

    def __init__(
        self,
        sample=None,
        path=None,
        from_date=None,
        until_date=None,
        crawl_time=None,
        note=None,
        keep_collection_open=None,
        steps=None,
        compile_releases=None,
        table_name=None,
        force_version=None,
        ignore_version=None,
        package_pointer=None,
        release_pointer=None,
        truncate=None,
        *args,
        **kwargs,
    ):
        """
        Store the spider arguments on the spider, and check that the class attributes are coherent.

        Spider arguments are received as strings. Most are stored as-is, and are validated and converted later by
        :meth:`from_crawler`. Extra keyword arguments whose names start with ``qs:`` are collected into
        ``query_string_parameters``.

        :param sample: the number of items to download (``'true'`` means ``1``; ``'false'`` and ``None`` mean no limit)
        :param path: path components to append to the URLs yielded by the ``start`` method (see :ref:`filter`)
        :param from_date: the date from which to download data (see :ref:`spider-arguments`)
        :param until_date: the date until which to download data (see :ref:`spider-arguments`)
        :param crawl_time: override the crawl's start time (see :ref:`increment`)
        :param note: a note to add to the collection in Kingfisher Process
        :param keep_collection_open: whether to close the collection in Kingfisher Process when the crawl is finished
        :param steps: a comma-separated list of steps to run in Kingfisher Process (``'compile'`` and/or ``'check'``)
        :param compile_releases: whether to create compiled releases from individual releases when using the
            :class:`~kingfisher_scrapy.extensions.database_store.DatabaseStore` extension
        :param table_name: override the crawl's table name in the database (see :ref:`database_store`)
        :param force_version: version to use instead of the version of the first package,
            if ``compile_releases`` is ``'true'``
        :param ignore_version: do not raise an error if the versions are inconsistent across packages to merge,
            if ``compile_releases`` is ``'true'``
        :param package_pointer: the JSON Pointer to the value in the package (see the :ref:`pluck` command)
        :param release_pointer: the JSON Pointer to the value in the release (see the :ref:`pluck` command)
        :param truncate: the number of characters to which the value is truncated (see the :ref:`pluck` command)
        :raises IncoherentConfigurationError: if ``concatenated_json`` and ``line_delimited`` are both ``True``, or if
            the ``date_format`` class attribute is not a key of ``VALID_DATE_FORMATS``
        """
        super().__init__(*args, **kwargs)

        if self.concatenated_json and self.line_delimited:
            raise IncoherentConfigurationError("concatenated_json = True is incompatible with line_delimited = True.")

        # https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments

        # Related to filtering data from the source.
        if sample == "true":
            self.sample = 1
        elif sample == "false":
            self.sample = None
        else:
            # Any other value is converted to an integer by from_crawler().
            self.sample = sample
        self.from_date = from_date
        self.until_date = until_date

        # Related to incremental crawls (whether KingfisherProcessAPI2 data_version or DatabaseStore directory).
        self.crawl_time = crawl_time

        # KingfisherProcessAPI2 extension.
        self.kingfisher_process_note = note
        self.kingfisher_process_keep_collection_open = keep_collection_open == "true"
        if steps is None:
            self.kingfisher_process_steps = {"compile"}
        else:
            # Unknown steps are discarded. An empty string results in no steps.
            self.kingfisher_process_steps = set(steps.split(",")) & self.available_steps

        # DatabaseStore extension.
        self.database_store_compile_releases = compile_releases == "true"
        self.database_store_table_name = table_name
        self.database_store_force_version = force_version
        self.database_store_ignore_version = ignore_version == "true"

        # Pluck pipeline.
        self.pluck_package_pointer = package_pointer
        self.pluck_release_pointer = release_pointer
        self.pluck_truncate = int(truncate) if truncate else None
        self.pluck = bool(package_pointer or release_pointer)

        # Collect the "qs:"-prefixed spider arguments, without the prefix.
        self.query_string_parameters = {key[3:]: value for key, value in kwargs.items() if key.startswith("qs:")}

        # Replace the name of the date format with its strptime format. Report a misconfigured class attribute in
        # the same way as other incoherent configurations, instead of as a bare KeyError.
        try:
            self.date_format = self.VALID_DATE_FORMATS[self.date_format]
        except KeyError:
            raise IncoherentConfigurationError(
                f"date_format must be one of {sorted(self.VALID_DATE_FORMATS)}, not {self.date_format!r}."
            ) from None

        if hasattr(self, "start"):
            if path:
                self.start = add_path_components(self.start, path)
            if self.query_string_parameters:
                self.start = add_query_string(self.start, self.query_string_parameters)

        self.filter_arguments = {
            "from_date": from_date,
            "until_date": until_date,
            "path": path,
        }
        self.filter_arguments.update(kwargs)

        # Log every spider argument as received. The named parameters never appear in kwargs, so each must be
        # listed here to be logged.
        spider_arguments = {
            "sample": sample,
            "note": note,
            "from_date": from_date,
            "until_date": until_date,
            "crawl_time": crawl_time,
            "keep_collection_open": keep_collection_open,
            "package_pointer": package_pointer,
            "release_pointer": release_pointer,
            "truncate": truncate,
            "compile_releases": compile_releases,
            "path": path,
            "steps": steps,
            "table_name": table_name,
            "force_version": force_version,
            "ignore_version": ignore_version,
        }
        spider_arguments.update(kwargs)

        self.logger.info("Spider arguments: %r", spider_arguments)

    # Scrapy calls this method to merge the spider's custom_settings into the project's settings.

[docs]
    @classmethod
    def update_settings(cls, settings):
        """
        Enable the HTTP proxy for this spider if its name is in the ``PROXY_SPIDERS`` setting, then merge the
        spider's ``custom_settings`` into the settings.

        If ``custom_settings`` already sets ``HTTPPROXY_ENABLED`` to a truthy value, it is left as-is.

        :param settings: the settings to update
        """
        # Work on a copy that belongs to this class only. ``custom_settings`` can be a dict inherited from a parent
        # class: mutating it in place would leak this spider's HTTPPROXY_ENABLED value to the parent class and to
        # sibling spiders, if several spiders are loaded in the same process.
        custom_settings = dict(cls.custom_settings or {})
        if not custom_settings.get("HTTPPROXY_ENABLED"):
            custom_settings["HTTPPROXY_ENABLED"] = cls.name in settings.getlist("PROXY_SPIDERS")
        cls.custom_settings = custom_settings
        super().update_settings(settings)



[docs]
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """
        Create the spider, then validate and convert its spider arguments.

        -  ``sample`` is converted to an integer and, if it is set, ``CONCURRENT_REQUESTS`` is set to 1.
        -  ``crawl_time`` is parsed as a "YYYY-MM-DDTHH:MM:SS" datetime.
        -  If ``date_required`` is ``True`` or if either ``from_date`` or ``until_date`` is set, both are set to
           their defaults if blank, and are parsed with :meth:`parse_date_argument` if they are strings.

        :raises SpiderArgumentError: if a spider argument is invalid, if both ``package_pointer`` and
            ``release_pointer`` are set, or if ``DATABASE_URL`` is set but ``crawl_time`` isn't
        """
        spider = super().from_crawler(crawler, *args, **kwargs)

        both_pointers = spider.pluck_package_pointer and spider.pluck_release_pointer
        if both_pointers:
            raise SpiderArgumentError("You cannot specify both package_pointer and release_pointer spider arguments.")

        sample = spider.sample
        if sample:
            crawler.settings.set("CONCURRENT_REQUESTS", 1, priority="spider")

            try:
                spider.sample = int(sample)
            except ValueError:
                raise SpiderArgumentError(f"spider argument `sample`: invalid integer value: {sample!r}") from None

        crawl_time = spider.crawl_time
        if crawl_time:
            try:
                parsed_crawl_time = datetime.datetime.strptime(crawl_time, "%Y-%m-%dT%H:%M:%S")
            except ValueError as e:
                raise SpiderArgumentError(f"spider argument `crawl_time`: invalid date value: {e}") from None
            spider.crawl_time = parsed_crawl_time

        if spider.from_date or spider.until_date or spider.date_required:
            # Process from_date entirely before until_date. Each default is looked up only if the argument is blank.
            defaults = (
                ("from_date", lambda: spider.default_from_date),
                ("until_date", lambda: cls.get_default_until_date(spider)),
            )
            for argument, get_default in defaults:
                if not getattr(spider, argument):
                    setattr(spider, argument, get_default())

                value = getattr(spider, argument)
                # A default can already be a date or datetime, in which case there is nothing to parse.
                if isinstance(value, str):
                    try:
                        parsed_date = spider.parse_date_argument(value)
                    except ValueError as e:
                        raise SpiderArgumentError(f"spider argument `{argument}`: invalid date value: {e}") from None
                    setattr(spider, argument, parsed_date)

        # DatabaseStore-related logic.
        database_url = crawler.settings["DATABASE_URL"]
        if database_url and not spider.crawl_time:
            raise SpiderArgumentError(
                "spider argument `crawl_time`: can't be blank if `DATABASE_URL` is set"
            ) from None

        return spider



[docs]
    def parse_date_argument(self, date):
        """Return the date argument as a datetime object."""
        return datetime.datetime.strptime(date, self.date_format).replace(tzinfo=datetime.timezone.utc)



[docs]
    def is_http_success(self, response):
        """
        Return whether the response was successful, which is to say, whether its status is a 2xx code.

        :param response: the response to check
        """
        # The 2xx class of status codes is the successful class.
        # https://tools.ietf.org/html/rfc7231#section-6.3
        return response.status // 100 == 2



[docs]
    def is_http_error_expected(self, response):
        """
        Return whether a non-2xx status is expected for the response.

        This implementation never expects one: it returns ``False`` for every response.

        :param response: the response to check
        """
        return False



[docs]
    def is_http_retryable(self, response):
        """
        Return whether the response's status is retryable.

        Set the ``retry_http_codes`` class attribute to a list of status codes to retry.
        """
        return response.status in self.retry_http_codes



[docs]
    def get_start_time(self, date_format):
        """Return the formatted start time of the crawl."""
        date = self.crawl_time or self.crawler.stats.get_value("start_time")
        return date.strftime(date_format)



[docs]
    def get_retry_wait_time(self, response):
        """
        Return the number of seconds to wait before retrying a URL.

        The ``Retry-After`` header is either a number of seconds or an HTTP-date. If the header is missing or can't
        be parsed, the wait time is 30 seconds. The wait time is never negative.

        https://www.rfc-editor.org/rfc/rfc9110#section-10.2.3

        :param response: the response whose ``Retry-After`` header to read
        :returns: the number of seconds to wait
        :rtype: int
        """
        default = 30
        value = response.headers.get("Retry-After", default)

        # The common case: a number of seconds. int() accepts bytes, str and int values.
        try:
            return max(int(value), 0)
        except (TypeError, ValueError):
            pass

        # Otherwise, try an HTTP-date, e.g. "Wed, 21 Oct 2015 07:28:00 GMT".
        if isinstance(value, bytes):
            value = value.decode("latin-1")
        if not isinstance(value, str):
            return default
        try:
            retry_at = parsedate_to_datetime(value)
        except (TypeError, ValueError):  # TypeError before Python 3.10, ValueError since.
            return default
        # A "-0000" time zone is parsed as a naive datetime. HTTP-dates are in GMT.
        if retry_at.tzinfo is None:
            retry_at = retry_at.replace(tzinfo=datetime.timezone.utc)

        delay = retry_at - datetime.datetime.now(tz=datetime.timezone.utc)
        # Round up, to not retry a fraction of a second too early. A date in the past means no wait.
        return max(math.ceil(delay.total_seconds()), 0)



[docs]
    def build_request(self, url, formatter, **kwargs):
        """
        Return a Scrapy request, with a file name added to the request's ``meta`` attribute. If the file name doesn't
        have a ``.json``, ``.csv``, ``.xlsx``, ``.rar`` or ``.zip`` extension, it adds a ``.json`` extension.

        If the last component of a URL's path is unique, use it as the file name. For example:

        >>> from kingfisher_scrapy.base_spiders import BaseSpider
        >>> from kingfisher_scrapy.util import components
        >>> url = 'https://example.com/package.json'
        >>> formatter = components(-1)
        >>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
        {'file_name': 'package.json'}

        To use a query string parameter as the file name:

        >>> from kingfisher_scrapy.util import parameters
        >>> url = 'https://example.com/packages?page=1&per_page=100'
        >>> formatter = parameters('page')
        >>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
        {'file_name': 'page-1.json'}

        To use a URL path component *and* a query string parameter as the file name:

        >>> from kingfisher_scrapy.util import join
        >>> url = 'https://example.com/packages?page=1&per_page=100'
        >>> formatter = join(components(-1), parameters('page'))
        >>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
        {'file_name': 'packages-page-1.json'}

        If ``formatter`` is ``None``, the ``meta`` keyword argument must set a ``file_name``, which is used as-is.
        Otherwise, the values in the ``meta`` keyword argument, if any, take precedence over the formatter's.

        :param str url: the URL to request
        :param formatter: a function that accepts a URL and returns a file name
        :returns: a Scrapy request
        :rtype: scrapy.Request
        :raises AssertionError: if ``formatter`` is ``None`` and the ``meta`` keyword argument sets no ``file_name``
        """
        # Tolerate a missing or None ``meta``, so that a missing file name is reported by the AssertionError below,
        # not by a KeyError or TypeError.
        extra_meta = kwargs.pop("meta", None) or {}

        meta = {}
        if formatter is None:
            if not extra_meta.get("file_name"):
                raise AssertionError("build_request() must be passed a file_name or a formatter")
        else:
            meta["file_name"] = formatter(url)
            # Other extensions are related to the Unflatten pipeline and CompressedFileSpider base class.
            if not meta["file_name"].endswith((".json", ".csv", ".xlsx", ".rar", ".zip")):
                meta["file_name"] += ".json"
        meta.update(extra_meta)
        return scrapy.Request(url, meta=meta, **kwargs)



[docs]
    def build_file_from_response(self, response, /, *, data_type, **kwargs):
        """
        Return a File item to yield, built from the response to a request.

        Unless set by keyword arguments, the item's ``file_name`` is the ``file_name`` in the request's ``meta``
        attribute, its ``url`` is the request's URL, and its ``data`` is the response's body, without a leading
        UTF-8 byte-order mark.
        """
        request = response.request
        # The defaults are read from the request first, whether or not keyword arguments override them.
        fields = {"file_name": request.meta["file_name"], "url": request.url, **kwargs}
        if "data" not in fields:
            content = response.body
            bom = codecs.BOM_UTF8
            # https://tools.ietf.org/html/rfc7159#section-8.1
            fields["data"] = content[len(bom) :] if content.startswith(bom) else content  # noqa: FURB188
        return self.build_file(data_type=data_type, **fields)



[docs]
    def build_file(self, *, file_name=None, url=None, data_type=None, data=None):
        """
        Return a File item to yield, with the given file name, URL, data type and data.

        All arguments are keyword-only and default to ``None``.
        """
        fields = {
            "file_name": file_name,
            "url": url,
            "data_type": data_type,
            "data": data,
        }
        return File(**fields)



[docs]
    def build_file_item(self, number, data, item):
        """
        Return a FileItem item to yield.

        The file name, URL and data type are copied from the given item.

        :param number: the number to set on the FileItem item
        :param data: the data to set on the FileItem item
        :param item: the item from which to copy the ``file_name``, ``url`` and ``data_type``
        """
        copied = {field: getattr(item, field) for field in ("file_name", "url", "data_type")}
        return FileItem(data=data, number=number, **copied)



[docs]
    def log_error_from_response(self, response, *, level="error", status=None, message=""):
        """
        Log a message about the response to a request, using the spider's logger.

        :param response: the response about which to log
        :param str level: the name of the logger method to call (default "error")
        :param status: the status to log, instead of the response's status, if truthy
        :param message: the message to log
        """
        log = getattr(self.logger, level)
        logged_status = status or response.status
        request = response.request
        file_name = request.meta.get("file_name", "")
        log("status=%d message=%r request=%s file_name=%s", logged_status, message, request, file_name)



[docs]
    @classmethod
    def get_default_until_date(cls, spider):
        """Return the ``default_until_date`` class attribute if truthy. Otherwise, return the current time."""
        if getattr(spider, "default_until_date", None):
            return spider.default_until_date
        return datetime.datetime.now(tz=datetime.timezone.utc)