Source code for kingfisher_scrapy.spiders.pakistan_ppra_api

import json

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import components, handle_http_error



[docs]
class PakistanPPRAAPI(SimpleSpider):
    """
    Domain
      Pakistan Public Procurement Regulatory Authority (PPRA)
    API documentation
      https://www.ppra.org.pk/api/
    """
    name = 'pakistan_ppra_api'

    # BaseSpider
    validate_json = True
    skip_pluck = 'Already covered (see code for details)'  # pakistan_ppra_bulk

    # SimpleSpider
    data_type = 'release_package'

    def start_requests(self):
        """
        Yield the single initial request, for the API's records index.

        The response is handed to :meth:`parse_list`, which extracts the URLs to download.
        """
        index_url = 'https://www.ppra.org.pk/api/index.php/api/records'
        yield scrapy.Request(index_url, meta={'file_name': 'list.html'}, callback=self.parse_list)

    @handle_http_error
    def parse_list(self, response):
        """
        Extract the list of URLs from the index page and yield one request per URL.

        The page is HTML (not a JSON document), so the list is pulled out of the body's text nodes and cleaned up
        before being decoded as JSON.
        """
        body_texts = response.xpath('//body//text()').getall()

        # The list is read from the seventh text node of the body. This position is hard-coded, so a change to the
        # page layout would break the extraction.
        raw_listing = body_texts[6]

        # Per the original author's note, the last item in the list breaks the JSON format; strip it (together with
        # its leading comma and blank line) so that the remainder can be decoded.
        json_text = raw_listing.replace(",\r\n\r\nhttps://www.ppra.org.pk", "")

        for record_url in json.loads(json_text):
            yield self.build_request(record_url, formatter=components(-2))