Source code for kingfisher_scrapy.spiders.peru_compras

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters



[docs]
class PeruCompras(SimpleSpider):
    """
    Domain
      Peru Compras (contracts within framework agreements)
    Caveats
      The JSON data sometimes contains unescaped newline characters within strings.
    Spider arguments
      from_date
        Download only data from this date onward (YYYY-MM-DD format). Defaults to '2017-01-01'.
      until_date
        Download only data until this date (YYYY-MM-DD format). Defaults to today.
    """

    name = 'peru_compras'

    # BaseSpider
    date_required = True
    default_from_date = '2017-01-01'

    # SimpleSpider
    data_type = 'release_package'

    # Local
    url_prefix = 'https://www.catalogos.perucompras.gob.pe/ConsultaOrdenesPub/'

    def start_requests(self):
        # A single request fetches the filters text, which parse_list() mines for framework agreement ids.
        yield scrapy.Request(
            f'{self.url_prefix}obtenerFiltros',
            meta={'file_name': 'list.html'},
            callback=self.parse_list,
        )

    @handle_http_error
    def parse_list(self, response):
        # Both ends of the date range are formatted once and reused in every download URL.
        date_start = self.from_date.strftime(self.date_format)
        date_end = self.until_date.strftime(self.date_format)

        # The body is one long string shaped like list_1¯list_2¯list_3. Only the first list is used: it holds
        # the framework agreements, whose ids are needed for querying the API.
        agreements_text = response.text.split('¯')[0]
        endpoint = f'{self.url_prefix}DescargaJsonOCDS'

        # Items within the list are separated by ¬ and each has the format id-type^description, e.g.:
        # 130-BIENES^IM-CE-2020-9 MATERIAL MÉDICO ¬128-BIENES^IM-CE-2020-8 DISPOSITIVO MÉDICO IN VITRO ¬
        for entry in agreements_text.split('¬'):
            # Everything before the first hyphen is the id.
            agreement_id, _, _ = entry.partition('-')
            if not agreement_id:
                # Skip items that yield an empty id.
                continue
            yield self.build_request(
                f'{endpoint}?pAcuerdo={agreement_id}&pFechaIni={date_start}&pFechaFin={date_end}',
                formatter=parameters('pAcuerdo'),
            )

    @handle_http_error
    def parse(self, response):
        # Every newline byte in the body is swapped for a space, which covers the unescaped newline characters
        # within strings, before the parent class parses the response.
        sanitized_body = response.body.replace(b'\n', b' ')
        yield from super().parse(response.replace(body=sanitized_body))