# Source code for kingfisher_scrapy.spiders.honduras_portal_bulk
import datetime
import scrapy
from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components
# [docs]
class HondurasPortalBulk(SimpleSpider):
    """
    Domain
      Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Secretaria de Finanzas de Honduras (SEFIN)
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2005-11'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
      publisher
        Filter by publisher:

        oncae
          Oficina Normativa de Contratación y Adquisiciones del Estado
        sefin
          Secretaria de Finanzas de Honduras
      system
        Filter by oncae system:

        CE
          Catálogo Electrónico
        DDC
          Módulo de Difusión Directa de Contratos
        HC1
          HonduCompras 1.0 (Módulo de Difusión de Compras y Contrataciones)
    Bulk download documentation
      http://www.contratacionesabiertas.gob.hn/descargas/
    """

    name = "honduras_portal_bulk"

    # BaseSpider
    date_format = "year-month"
    default_from_date = "2005-11"
    skip_pluck = "Already covered (see code for details)"  # honduras_portal_api_releases

    # SimpleSpider
    data_type = "release_package"

    # Local
    # Maps each ``publisher`` argument value to the label that ``parse_list`` compares
    # against the "publicador" field of the download list.
    available_publishers = {
        "oncae": "Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Honduras",
        "sefin": "Secretaria de Finanzas de Honduras",
    }
    # Maps each ``system`` argument value to the label that ``parse_list`` compares
    # against the "sistema" field of the download list.
    available_systems = {
        "HC1": "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones",
        "CE": "Catálogo Electrónico",
        "DDC": "Módulo de Difusión Directa de Contratos",
    }

    async def start(self):
        """Request the JSON list of bulk downloads; ``parse_list`` handles the response."""
        list_url = "http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json"
        yield scrapy.Request(list_url, callback=self.parse_list)

    @classmethod
    def from_crawler(cls, crawler, publisher=None, system=None, *args, **kwargs):
        """
        Build the spider, then validate the ``publisher`` and ``system`` spider arguments.

        :raises SpiderArgumentError: if ``publisher`` is set but not a key of ``available_publishers``,
          if ``system`` is set while the publisher is not "oncae",
          or if ``system`` is set but not a key of ``available_systems``
        """
        spider = super().from_crawler(crawler, *args, publisher=publisher, system=system, **kwargs)

        if publisher and spider.publisher not in spider.available_publishers:
            raise SpiderArgumentError(f"spider argument `publisher`: {spider.publisher!r} not recognized")

        # Nothing more to validate unless a system filter was requested.
        if not system:
            return spider

        # The system filter is accepted for the "oncae" publisher only.
        if spider.publisher != "oncae":
            raise SpiderArgumentError(
                f"spider argument `system` is not supported for publisher: {spider.publisher!r}"
            )
        if spider.system not in spider.available_systems:
            raise SpiderArgumentError(f"spider argument `system`: {spider.system!r} not recognized")

        return spider

    def parse_list(self, response):
        """
        Yield a request for the JSON file of each listed download that passes the spider's filters.

        The response looks like:

        [
          {
            "urls": {
              "csv": "...",
              "md5": "...",
              "json": "...",
              "xlsx": "..."
            },
            "year": "values between 2005 to the current year",
            "month": "values between 1 and 12",
            "sistema": "values from available_system",
            "publicador": "values from available_publishers"
          }, ...
        ]
        """
        # Built once and shared by every request yielded below.
        formatter = components(-1)
        for entry in response.json():
            if self._is_selected(entry):
                yield self.build_request(entry["urls"]["json"], formatter=formatter)

    def _is_selected(self, item):
        """Return whether a download list entry passes the publisher, system and date filters."""
        source = item["publicador"]
        if self.publisher and source != self.available_publishers.get(self.publisher):
            return False

        # "sistema" is read for ONCAE entries only; it is compared only if a system filter is set.
        if source == self.available_publishers["oncae"]:
            platform = item["sistema"]
            if self.system and platform != self.available_systems.get(self.system):
                return False

        if self.from_date and self.until_date:
            # Each entry covers one month, represented here by the first day of that month in UTC.
            # NOTE(review): assumes from_date and until_date are timezone-aware datetimes
            # set by the base spider - confirm.
            month_start = datetime.datetime(
                int(item["year"]), int(item["month"]), 1, tzinfo=datetime.timezone.utc
            )
            in_range = self.from_date <= month_start <= self.until_date
            if not in_range:
                return False

        return True