Source code for kingfisher_scrapy.spiders.peru_compras

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters


class PeruCompras(SimpleSpider):
    """
    Domain
      Peru Compras (contracts within framework agreements)
    Caveats
      The JSON data sometimes contains unescaped newline characters within strings.
    Spider arguments
      from_date
        Download only data from this date onward (YYYY-MM-DD format). Defaults to '2017-01-01'.
      until_date
        Download only data until this date (YYYY-MM-DD format). Defaults to today.
    """
    name = 'peru_compras'

    # BaseSpider
    date_required = True
    default_from_date = '2017-01-01'

    # SimpleSpider
    data_type = 'release_package'

    # Local
    url_prefix = 'https://www.catalogos.perucompras.gob.pe/ConsultaOrdenesPub/'

    def start_requests(self):
        """Request the filters endpoint, whose response is handled by ``parse_list``."""
        filters_url = f'{self.url_prefix}obtenerFiltros'
        yield scrapy.Request(filters_url, meta={'file_name': 'list.html'}, callback=self.parse_list)

    @handle_http_error
    def parse_list(self, response):
        """
        Yield one ``DescargaJsonOCDS`` request per framework agreement id found in the filters response.

        Each request covers the spider's ``from_date``..``until_date`` range.
        """
        from_date = self.from_date.strftime(self.date_format)
        until_date = self.until_date.strftime(self.date_format)

        # The response is a large text that looks like list_1¯list_2¯list_3. The first list holds the
        # framework agreements needed for querying the API, and its items are separated by ¬
        framework_list = response.text.split('¯')[0]

        # Each item has the format id-type^description, and only the id (the text before the first
        # hyphen) is needed for querying the API, e.g.:
        # 130-BIENES^IM-CE-2020-9 MATERIAL MÉDICO ¬128-BIENES^IM-CE-2020-8 DISPOSITIVO MÉDICO IN VITRO ¬
        candidate_ids = (item.partition('-')[0] for item in framework_list.split('¬'))

        # Items without an id (such as the empty string after a trailing separator) are dropped.
        for framework_id in filter(None, candidate_ids):
            download_url = (
                f'{self.url_prefix}DescargaJsonOCDS'
                f'?pAcuerdo={framework_id}&pFechaIni={from_date}&pFechaFin={until_date}'
            )
            yield self.build_request(download_url, formatter=parameters('pAcuerdo'))

    @handle_http_error
    def parse(self, response):
        """Replace newline bytes in the body with spaces, then delegate to the parent ``parse``."""
        # Unescaped newline characters within strings are replaced with a space.
        sanitized_body = response.body.replace(b'\n', b' ')
        yield from super().parse(response.replace(body=sanitized_body))