Source code for kingfisher_scrapy.spiders.honduras_oncae

from kingfisher_scrapy.base_spiders import CompressedFileSpider, PeriodicSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import MAX_DOWNLOAD_TIMEOUT, components


class HondurasONCAE(CompressedFileSpider, PeriodicSpider):
    """
    Domain
      Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE)
    Spider arguments
      from_date
        Download only data from this year onward (YYYY format). Defaults to '2005'.
      until_date
        Download only data until this year (YYYY format). Defaults to the current year.
      system
        Filter by system:

        CE
          Catálogo Electrónico
        DDC
          Módulo de Difusión Directa de Contratos
        HC1
          HonduCompras 1.0 (Módulo de Difusión de Compras y Contrataciones)
    Bulk download documentation
      https://oncae.gob.hn/datos-abiertos/
    """

    name = "honduras_oncae"
    custom_settings = {
        "DOWNLOAD_TIMEOUT": MAX_DOWNLOAD_TIMEOUT / 2,  # 15min
    }

    # BaseSpider
    date_format = "year"
    default_from_date = "2005"
    skip_pluck = "Already covered (see code for details)"  # honduras_portal_api_releases

    # SimpleSpider
    data_type = "release_package"

    # CompressedFileSpider
    yield_non_archive_file = True

    # PeriodicSpider
    pattern = "https://datosabiertos.oncae.gob.hn/datosabiertos/{}"
    formatter = staticmethod(components(-1))  # year

    # Local
    # Maps each system code to the first year used when building its URLs.
    available_systems = {"HC1": 2005, "CE": 2014, "DDC": 2010}

    @classmethod
    def from_crawler(cls, crawler, system=None, *args, **kwargs):
        """
        Build the spider via the parent ``from_crawler`` and validate the ``system`` argument.

        :param crawler: passed through unchanged to the parent ``from_crawler``
        :param system: optional system code; when truthy, it must be a key of ``available_systems``
        :returns: the spider instance created by the parent ``from_crawler``
        :raises SpiderArgumentError: if ``system`` is truthy and the spider's ``system`` is not recognized
        """
        spider = super().from_crawler(crawler, *args, system=system, **kwargs)

        # Nothing to validate when no system was requested, or when it is a known one.
        if not system or spider.system in spider.available_systems:
            return spider

        raise SpiderArgumentError(f"spider argument `system`: {spider.system!r} not recognized")

    def build_urls(self, date):
        """
        Yield one URL per system for which a file is requested for the given year.

        A system is skipped if a ``system`` filter is set and names a different system, if ``date`` is earlier
        than the system's first year in ``available_systems``, or if the system is DDC and ``date`` is after 2019.

        :param date: the year to build URLs for (compared against the integer years in ``available_systems``)
        """
        for code, first_year in self.available_systems.items():
            # Honor the optional `system` spider argument.
            if self.system and code != self.system:
                continue

            # Skip years before the system's first year, and DDC years after 2019.
            if date < first_year or (code == "DDC" and date > 2019):
                continue

            # HC1 files are plain JSON; the other systems' files are ZIP archives.
            if code == "HC1":
                file_name = f"{code}_datos_{date}.json"
            else:
                file_name = f"{code}_datos_{date}_json.zip"

            yield self.pattern.format(f"{code}/{file_name}")