Source code for kingfisher_scrapy.spiders.moldova_multi_record

import scrapy

from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import RetryableError
from kingfisher_scrapy.util import components, replace_parameters


class MoldovaMultiRecord(BaseSpider):
    """
    Domain
      MTender
    Caveats
      The ``https://public.mtender.gov.md/tenders/{ocid}`` endpoint returns a record package in which each record has
      a different ``ocid`` value (as expected), but these actually represent the same contracting process (not
      expected). To fix this, we reformat the record package as a release package, using each record's
      ``compiledRelease`` as an individual release, and replacing the release's ``ocid`` value with the OCID from the
      URL.

      The compliant OCDS endpoint ``http://public.eprocurement.systems/ocds/tenders/{ocid}`` returns error messages.
    Spider arguments
      from_date
        Download only data from this time onward (YYYY-MM-DDThh:mm:ss format).
    """

    name = "moldova_multi_record"

    # BaseSpider
    date_format = "datetime"

    # Local
    base_url = "https://public.mtender.gov.md/tenders/"

    async def start(self):
        """Yield the request for the first page of the tender list, offset by ``from_date`` if it is set."""
        url = self.base_url
        if self.from_date:
            offset = self.from_date.strftime(self.date_format)
            url = f"{url}?offset={offset}"
        yield scrapy.Request(url, callback=self.parse_list)

    def load_json_or_retry_error(self, response):
        r"""
        Return the parsed JSON body of the response, or raise ``RetryableError`` if the body is empty or describes
        an error (despite the HTTP 200 status), like:

        .. code-block:: json

           {
             "message": "connect EHOSTUNREACH 185.108.182.236:443",
             "name": "Error",
             "stack": "Error: connect EHOSTUNREACH 185.108.182.236:443\n    at TCPConnectWrap.afterConnect...",
             "config": {
               "url": "https://public.mtender.gov.md/tenders/ocds-b3wdp1-MD-1603913785143",
               "method": "get",
               "headers": {
                 "Accept": "application/json, text/plain, */*",
                 "User-Agent": "axios/0.21.1"
               },
               "transformRequest": [
                 null
               ],
               "transformResponse": [
                 null
               ],
               "timeout": 0,
               "xsrfCookieName": "XSRF-TOKEN",
               "xsrfHeaderName": "X-XSRF-TOKEN",
               "maxContentLength": -1,
               "maxBodyLength": -1
             },
             "code": "EHOSTUNREACH"
           }
        """
        if response.body:
            payload = response.json()
            # An error payload is recognized by its "name" member.
            if payload.get("name") != "Error":
                return payload
        raise RetryableError

    def parse_list(self, response):
        """Yield one request per OCID listed on this page, followed by the request for the next page."""
        page = self.load_json_or_retry_error(response)
        # The last page returns an empty JSON object.
        if page:
            for entry in page["data"]:
                yield self.build_request(f"{self.base_url}{entry['ocid']}", formatter=components(-1))
            # Continue from the offset reported by this page.
            next_url = replace_parameters(response.request.url, offset=page["offset"])
            yield scrapy.Request(next_url, callback=self.parse_list)

    def parse(self, response):
        """Reshape the record package into a release package in which all releases share the OCID from the URL."""
        package = self.load_json_or_retry_error(response)
        # The OCID is the last component of the requested URL.
        ocid = components(-1)(response.request.url)
        records = package.pop("records")
        # Each record's compiled release becomes one release, with its "ocid" replaced.
        releases = [{**record["compiledRelease"], "ocid": ocid} for record in records]
        yield self.build_file_from_response(
            response, data_type="release_package", data={**package, "releases": releases}
        )