# Source code for kingfisher_scrapy.spiders.italy_anac

import datetime

from kingfisher_scrapy.base_spiders import BigFileSpider, CKANSpider
from kingfisher_scrapy.util import BROWSER_USER_AGENT, MAX_DOWNLOAD_TIMEOUT, components



# [docs]
class ItalyANAC(CKANSpider, BigFileSpider):
    """
    Domain
      Autorità Nazionale Anticorruzione (ANAC)
    Caveats
      If the OCID is missing, the spider derives the ``ocid`` field from the ``id`` field.
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2018-01'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
    API documentation
      https://dati.anticorruzione.it/opendata/about
    Bulk download documentation
      https://dati.anticorruzione.it/opendata/organization/anticorruzione
    """

    name = "italy_anac"
    custom_settings = {
        # Twice the project-wide maximum download timeout (1h).
        "DOWNLOAD_TIMEOUT": MAX_DOWNLOAD_TIMEOUT * 2,
        # Without a browser user agent, the API returns a "Request Rejected" HTML page.
        "USER_AGENT": BROWSER_USER_AGENT,
    }

    # BaseSpider
    date_format = "year-month"
    default_from_date = "2018-01"

    # SimpleSpider
    data_type = "release_package"

    # CKANSpider
    ckan_api_url = "https://dati.anticorruzione.it/opendata"
    ckan_search_query = "ocds"
    # Bulk file URLs look like:
    # https://dati.anticorruzione.it/opendata/download/dataset/ocds/filesystem/bulk/2022/01.json
    formatter = staticmethod(components(-2))

    # CKANSpider
    def get_resource_date(self, resource):
        """
        Return the first day of the resource's month as a UTC-aware datetime.

        The month is taken from the resource's ``url``: ``formatter`` is expected to produce a "YEAR-MONTH" string
        (presumably "2022-01" for the example URL above; confirm against ``components``).

        :param resource: a mapping with a ``url`` key
        :raises ValueError: if the formatted URL isn't two hyphen-separated integers forming a valid year and month
        """
        period = self.formatter(resource["url"])
        year_text, month_text = period.split("-")
        return datetime.datetime(
            year=int(year_text),
            month=int(month_text),
            day=1,
            tzinfo=datetime.timezone.utc,
        )

    # ResizePackageMiddleware
    def ocid_fallback(self, release):
        """
        Return a fallback OCID built from the first three hyphen-separated segments of the release's ``id``.

        For example, the release ID ocds-hu01ve-7608611-01 yields ocds-hu01ve-7608611.

        :param release: a mapping with a string ``id`` key
        """
        segments = release["id"].split("-")
        return "-".join(segments[:3])