Source code for kingfisher_scrapy.spiders.honduras_portal_bulk

from datetime import datetime

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components, handle_http_error


class HondurasPortalBulk(SimpleSpider):
    """
    Domain
      Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Secretaria de Finanzas de Honduras (SEFIN)
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2005-11'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
      publisher
        Filter by publisher:

        oncae
          Oficina Normativa de Contratación y Adquisiciones del Estado
        sefin
          Secretaria de Finanzas de Honduras
      system
        Filter by oncae system:

        CE
          Catálogo Electrónico
        DDC
          Módulo de Difusión Directa de Contratos
        HC1
          HonduCompras 1.0 (Módulo de Difusión de Compras y Contrataciones)
    Bulk download documentation
      http://www.contratacionesabiertas.gob.hn/descargas/
    """
    name = 'honduras_portal_bulk'

    # BaseSpider
    date_format = 'year-month'
    default_from_date = '2005-11'
    skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases

    # SimpleSpider
    data_type = 'release_package'

    # Local
    # Maps the `publisher` spider argument to the `publicador` value used in the download list.
    available_publishers = {
        'oncae': 'Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Honduras',
        'sefin': 'Secretaria de Finanzas de Honduras',
    }
    # Maps the `system` spider argument to the `sistema` value used in the download list.
    available_systems = {
        'HC1': 'HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones',
        'CE': 'Catálogo Electrónico',
        'DDC': 'Módulo de Difusión Directa de Contratos',
    }

    def start_requests(self):
        """Yield the single request for the JSON list of bulk downloads, handled by ``parse_list``."""
        yield scrapy.Request(
            'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
            meta={'file_name': 'list.json'},
            callback=self.parse_list,
        )

    @classmethod
    def from_crawler(cls, crawler, publisher=None, system=None, *args, **kwargs):
        """
        Build the spider and validate the ``publisher`` and ``system`` spider arguments.

        :param crawler: the crawler, passed through to the parent ``from_crawler``
        :param publisher: optional key of ``available_publishers``
        :param system: optional key of ``available_systems``; only accepted with publisher 'oncae'
        :returns: the spider instance created by the parent ``from_crawler``
        :raises SpiderArgumentError: if ``publisher`` or ``system`` is not recognized, or if ``system`` is
          given while the publisher is not 'oncae'
        """
        # NOTE(review): the checks below read `spider.publisher` / `spider.system`, so the parent
        # `from_crawler` is presumably what stores these keyword arguments on the instance - confirm.
        spider = super().from_crawler(crawler, *args, publisher=publisher, system=system, **kwargs)

        if publisher and spider.publisher not in spider.available_publishers:
            raise SpiderArgumentError(f'spider argument `publisher`: {spider.publisher!r} not recognized')

        # Without a system filter, there is nothing left to validate.
        if not system:
            return spider

        # The system filter is only meaningful for ONCAE data.
        if spider.publisher != 'oncae':
            message = f'spider argument `system` is not supported for publisher: {spider.publisher!r}'
            raise SpiderArgumentError(message)
        if spider.system not in spider.available_systems:
            raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized')

        return spider

    @handle_http_error
    def parse_list(self, response):
        """
        Yield one request per entry of the download list that passes the publisher, system and date filters.

        :param response: the response to the request made by ``start_requests``; its body is read as JSON
        """
        # NOTE(review): presumably names each file after the last component of its URL - confirm
        # against kingfisher_scrapy.util.components.
        formatter = components(-1)

        # These values don't change between entries, so look them up once.
        wanted_publisher = self.available_publishers.get(self.publisher)
        wanted_system = self.available_systems.get(self.system)
        oncae = self.available_publishers['oncae']

        # An example of expected response is:
        # [
        #   {
        #     "urls": {
        #       "csv": "...",
        #       "md5": "...",
        #       "json": "...",
        #       "xlsx": "..."
        #     },
        #     "year": "values between 2005 to the current year",
        #     "month": "values between 1 and 12",
        #     "sistema": "values from available_system",
        #     "publicador": "values from available_publishers"
        #   }, ...
        # ]
        for entry in response.json():
            entry_publisher = entry['publicador']
            if self.publisher and entry_publisher != wanted_publisher:
                continue

            # Only ONCAE entries are filtered by system.
            if entry_publisher == oncae:
                entry_system = entry['sistema']
                if self.system and entry_system != wanted_system:
                    continue

            # The date filter applies only if both bounds are set. Entries are dated to the first of their month.
            if self.from_date and self.until_date:
                month_start = datetime(int(entry['year']), int(entry['month']), 1)
                if month_start < self.from_date or month_start > self.until_date:
                    continue

            yield self.build_request(entry['urls']['json'], formatter=formatter)