Source code for kingfisher_scrapy.spiders.italy_anac

import datetime

from kingfisher_scrapy.base_spiders import BigFileSpider, CKANSpider
from kingfisher_scrapy.util import BROWSER_USER_AGENT, MAX_DOWNLOAD_TIMEOUT, components


class ItalyANAC(CKANSpider, BigFileSpider):
    """
    Domain
      Autorità Nazionale Anticorruzione (ANAC)
    Caveats
      If the OCID is missing, the spider derives the ``ocid`` field from the ``id`` field.
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2018-01'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
    API documentation
      https://dati.anticorruzione.it/opendata/about
    Bulk download documentation
      https://dati.anticorruzione.it/opendata/organization/anticorruzione
    """

    name = "italy_anac"
    custom_settings = {
        "DOWNLOAD_TIMEOUT": MAX_DOWNLOAD_TIMEOUT * 2,  # 1h
        "USER_AGENT": BROWSER_USER_AGENT,  # Otherwise, API returns "Request Rejected" HTML
    }

    # BaseSpider
    date_format = "year-month"
    default_from_date = "2018-01"

    # SimpleSpider
    data_type = "release_package"

    # CKANSpider
    ckan_api_url = "https://dati.anticorruzione.it/opendata"
    ckan_search_query = "ocds"
    # e.g. https://dati.anticorruzione.it/opendata/download/dataset/ocds/filesystem/bulk/2022/01.json
    formatter = staticmethod(components(-2))

    # CKANSpider
    def get_resource_date(self, resource):
        """
        Return the UTC datetime for the first day of the month identified by the resource's URL.

        The formatter is applied to ``resource["url"]`` and its result is split on a hyphen into
        a year and a month (presumably "YYYY-MM", given the example URL above - confirm against
        ``components``).
        """
        url_date = self.formatter(resource["url"])
        year_text, month_text = url_date.split("-")
        return datetime.datetime(
            year=int(year_text),
            month=int(month_text),
            day=1,
            tzinfo=datetime.timezone.utc,
        )

    # ResizePackageMiddleware
    def ocid_fallback(self, release):
        """
        Build a fallback ocid from the first three hyphen-separated parts of the release's ``id``.

        For example, ocds-hu01ve-7608611-01 yields ocds-hu01ve-7608611.
        """
        leading_parts = release["id"].split("-")[:3]
        return "-".join(leading_parts)