# Source code for kingfisher_scrapy.spiders.guatemala_bulk

import datetime

import scrapy

from kingfisher_scrapy.base_spiders import CompressedFileSpider
from kingfisher_scrapy.util import components, join


class GuatemalaBulk(CompressedFileSpider):
    """
    Domain
      Ministerio de Finanzas Públicas - Dirección General de Adquisiciones del Estado
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2020-01'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
    API documentation
      https://ocds.guatecompras.gt/api-ocds
    Bulk download documentation
      https://ocds.guatecompras.gt/descarga-datos
    """

    name = "guatemala_bulk"
    custom_settings = {
        # Reduce the number of concurrent requests to avoid multiple failures.
        "CONCURRENT_REQUESTS": 1,
    }
    retry_http_codes = [400]

    # BaseSpider
    date_format = "year-month"
    default_from_date = "2020-01"

    # SimpleSpider
    data_type = "record_package"

    async def start(self):
        """Yield the single request for the bulk file index, handled by :meth:`parse_list`."""
        yield scrapy.Request("https://ocds.guatecompras.gt/files", callback=self.parse_list)

    def parse_list(self, response):
        """
        Yield one request per listed item, for the URL under its ``files.json`` key.

        When both ``from_date`` and ``until_date`` are set, items whose year and month fall outside that
        inclusive range are skipped.

        NOTE(review): the example below shows a top-level ``results`` key, but the code reads ``result``.
        Confirm against the live API which of the two is correct.

        The response looks like:

        {
          "id": "gc-{year}-{month}",
          "results": [
            {
              "files": {
                "csv": "...",
                "sha": "...",
                "json": "...",
                "xlsx": "..."
              },
              "year": "values between 2020 to the current year",
              "month": "values between 1 and 12",
              "monthName": "values between enero to diciembre",
              "source": "Guatecompras",
              "timestamp": "last updated date in timestamp with time zone format"
            },
            ...
          ]
        }
        """
        for item in response.json()["result"]:
            if self.from_date and self.until_date:
                # Each item covers one month, so it is represented by the first day of that month.
                # NOTE(review): built as UTC-aware, presumably to match from_date/until_date - confirm.
                date = datetime.datetime(int(item["year"]), int(item["month"]), 1, tzinfo=datetime.timezone.utc)
                if not (self.from_date <= date <= self.until_date):
                    continue
            # NOTE(review): the formatter presumably names the file after the second-to-last URL path
            # component, with a "zip" extension - confirm in kingfisher_scrapy.util.
            yield self.build_request(item["files"]["json"], formatter=join(components(-2), extension="zip"))