Source code for kingfisher_scrapy.spiders.honduras_iaip

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components


class HondurasIAIP(SimpleSpider):
    """
    Domain
      Instituto de Acceso a la Información Publica (IAIP)
    Spider arguments
      portal
        Filter by portal:

        covid19
          IAIP Emergencia Covid-19
        huracanes
          Emergencia Huracán ETA
        oficio
          Portal Único de Transparencia
    Bulk download documentation
      https://portalunico.iaip.gob.hn/datosabierto/
    """

    name = "honduras_iaip"

    # SimpleSpider
    data_type = "release_package"

    # Local
    # Keys of the listing response that this spider knows how to read; also the
    # only accepted values for the ``portal`` spider argument.
    available_portals = ["covid19", "huracanes", "oficio"]

    @classmethod
    def from_crawler(cls, crawler, portal=None, *args, **kwargs):
        """
        Build the spider and validate the optional ``portal`` spider argument.

        :param crawler: the crawler, forwarded to the parent ``from_crawler``
        :param portal: optional portal name; when truthy, the spider's ``portal``
          attribute must be one of ``available_portals``
        :returns: the spider instance created by the parent class
        :raises SpiderArgumentError: if ``portal`` is truthy and not recognized
        """
        spider = super().from_crawler(crawler, *args, portal=portal, **kwargs)

        # No filter requested, or the requested filter is a known portal.
        if not portal or spider.portal in spider.available_portals:
            return spider

        raise SpiderArgumentError(f"spider argument `portal`: {spider.portal!r} not recognized")

    async def start(self):
        """Yield the single request for the JSON listing of downloadable files."""
        listing_url = "https://www.contratacionesabiertas.gob.hn/api/v1/iaip_datosabiertos/?format=json"
        yield scrapy.Request(listing_url, callback=self.parse_list)

    def parse_list(self, response):
        """
        Yield one request per non-compiled JSON file listed for the selected portals.

        :param response: the response to the listing request issued by ``start``
        """
        for portal_name in self.available_portals:
            # When a ``portal`` filter is set, only that portal's files are requested.
            if not self.portal or self.portal == portal_name:
                # Each portal is an array of objects with the filename and its CSV, Excel and JSON URL
                # representations:
                #
                # "portal": [ {"nombreArchivo": "name", "excel": "URL", "csv": "URL", "json": "URL"} ]
                for entry in response.json()[portal_name]:
                    json_url = entry["json"]
                    # Retrieve URLs for packages of individual releases, not of compiled releases.
                    if "compiled" in json_url.lower():
                        continue
                    # NOTE(review): components(-1) presumably derives the file name from the last
                    # component of the URL path - confirm against kingfisher_scrapy.util.
                    yield self.build_request(json_url, formatter=components(-1))