# Source code for kingfisher_scrapy.spiders.guatemala_bulk
import datetime
import scrapy
from kingfisher_scrapy.base_spiders import CompressedFileSpider
from kingfisher_scrapy.util import components, join
# [docs] (Sphinx viewcode link marker left over from the HTML source page; not part of the module)
class GuatemalaBulk(CompressedFileSpider):
    """
    Domain
      Ministerio de Finanzas Públicas - Dirección General de Adquisiciones del Estado
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2020-01'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
    API documentation
      https://ocds.guatecompras.gt/api-ocds
    Bulk download documentation
      https://ocds.guatecompras.gt/descarga-datos
    """

    name = "guatemala_bulk"
    custom_settings = {
        # Issue one request at a time: concurrent requests lead to multiple failures.
        "CONCURRENT_REQUESTS": 1,
    }
    retry_http_codes = [400]

    # BaseSpider
    date_format = "year-month"
    default_from_date = "2020-01"

    # SimpleSpider
    data_type = "record_package"

    async def start(self):
        """Yield the single request for the file index, handled by :meth:`parse_list`."""
        list_url = "https://ocds.guatecompras.gt/files"
        yield scrapy.Request(list_url, callback=self.parse_list)

    def parse_list(self, response):
        """
        Yield one download request per monthly file listed in the index response.

        Each entry under the response's ``result`` key is read for its ``year``, ``month`` and
        ``files.json`` values. When both ``from_date`` and ``until_date`` are set, an entry is skipped
        unless the first day of its month (UTC) lies within that range, bounds included.

        The response looks like (only ``result``, ``year``, ``month`` and ``files.json`` are read here;
        the top-level key is written as ``result`` to match what this method reads):

        {
          "id": "gc-{year}-{month}",
          "result": [
            {
              "files": {
                "csv": "...",
                "sha": "...",
                "json": "...",
                "xlsx": "..."
              },
              "year": "values between 2020 to the current year",
              "month": "values between 1 and 12",
              "monthName": "values between enero to diciembre",
              "source": "Guatecompras",
              "timestamp": "last updated date in timestamp with time zone format"
            }, ...
          ]
        }
        """
        entries = response.json()["result"]
        for entry in entries:
            if self.from_date and self.until_date:
                # Represent the entry's month by its first day, so that it is comparable to the bounds.
                month_start = datetime.datetime(
                    year=int(entry["year"]),
                    month=int(entry["month"]),
                    day=1,
                    tzinfo=datetime.timezone.utc,
                )
                if month_start < self.from_date or month_start > self.until_date:
                    continue

            json_url = entry["files"]["json"]
            formatter = join(components(-2), extension="zip")
            yield self.build_request(json_url, formatter=formatter)