# Source code for kingfisher_scrapy.spiders.moldova

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import components, handle_http_error, join, parameters, replace_parameters

class Moldova(SimpleSpider):
    """
    Domain
      MTender
    Spider arguments
      from_date
        Download only data from this time onward (YYYY-MM-DDThh:mm:ss format).
    """
    name = 'moldova'

    # BaseSpider
    date_format = 'datetime'

    # SimpleSpider
    data_type = 'release_package'

    def start_requests(self):
        # The listing service offers three endpoints: /tenders/, /tenders/plan/ and /budgets/. However, it
        # publishes contracting processes under multiple OCIDs. The data service instead publishes each
        # contracting process under one OCID, but has no endpoint to list OCIDs.
        #
        # As such, OCIDs are retrieved from the listing service, and data from the data service.
        #
        # Note: The OCIDs from the /budgets/ endpoint have no corresponding data in the data service. The OCIDs
        # from the /tenders/plan/ endpoint are the same as from the /tenders/ endpoint.
        #
        # NOTE(review): the listing-service base URL below is empty in this copy of the file — confirm the
        # intended URL against the repository.
        url = ''
        if self.from_date:
            url = f'{url}?offset={self.from_date.strftime(self.date_format)}'
        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)

    @handle_http_error
    def parse_list(self, response):
        """
        Yield one request per OCID listed on this page of results, then a request for the next page.
        """
        # NOTE(review): the data-service base URL below is empty in this copy of the file — confirm the
        # intended URL against the repository.
        base_url = ''

        data = response.json()

        # The last page returns an empty JSON object.
        if not data:
            return

        # The service occasionally returns an error response with an HTTP 200 status code, e.g.:
        #
        # {
        #     "message": "connect EHOSTUNREACH",
        #     "name": "Error",
        #     "stack": "Error: connect EHOSTUNREACH\n    at TCPConnectWrap.afterConnect...",
        #     "config": {
        #         "url": "", "method": "get",
        #         "headers": {"Accept": "application/json, text/plain, */*", "User-Agent": "axios/0.21.1"},
        #         "transformRequest": [null], "transformResponse": [null], "timeout": 0,
        #         "xsrfCookieName": "XSRF-TOKEN", "xsrfHeaderName": "X-XSRF-TOKEN", "maxContentLength": -1,
        #         "maxBodyLength": -1},
        #     "code": "EHOSTUNREACH"
        # }
        if 'name' in data and data['name'] == 'Error':
            data['http_code'] = response.status
            yield self.build_file_error_from_response(response, errors=data)
            return

        # Request the data for each OCID on this page.
        for entry in data['data']:
            detail_url = replace_parameters(base_url, offset=None) + entry['ocid']
            yield self.build_request(detail_url, formatter=components(-2))

        # Request the next page of results, using the offset returned with this page.
        next_url = replace_parameters(response.request.url, offset=data['offset'])
        yield self.build_request(next_url, formatter=join(components(-1), parameters('offset')),
                                 callback=self.parse_list)