Source code for kingfisher_scrapy.spiders.colombia_bulk

import scrapy

from kingfisher_scrapy.base_spiders import CompressedFileSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components, handle_http_error

[docs]class ColombiaBulk(CompressedFileSpider): """ Domain Colombia Compra Eficiente (CCE) Spider arguments from_date Download only data from this year onward (YYYY format). Defaults to '2011'. Only supported for 'SECOP1' system. until_date Download only data until this year (YYYY format). Defaults to the current year. Only supported for 'SECOP1' system. system Filter by system: SECOP1 Sistema Electrónico para la Contratación Pública (todos los procesos de contratación) SECOP2 Sistema Electrónico para la Contratación Pública (todos los procesos de contratación) TVEC Tienda Virtual del Estado Colombiano (acuerdos marco, agregación de demanda, mínima cuantía) Bulk download documentation """ name = 'colombia_bulk' download_timeout = 99999 custom_settings = { 'DOWNLOAD_FAIL_ON_DATALOSS': False, } # BaseSpider date_format = 'year' default_from_date = '2011' line_delimited = True root_path = 'Release' skip_pluck = 'Already covered (see code for details)' # colombia # SimpleSpider data_type = 'release' encoding = 'iso-8859-1' # Local available_systems = {'SECOP1': 'SI', 'SECOP2': 'SECOP2', 'TVEC': 'TVEC'} @classmethod def from_crawler(cls, crawler, system=None, *args, **kwargs): spider = super().from_crawler(crawler, system=system, *args, **kwargs) if system and spider.system not in spider.available_systems: raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized') if spider.from_date and spider.until_date and spider.system != 'SECOP1': raise SpiderArgumentError('spider date arguments are only supported for SECOP1 system') return spider def start_requests(self): yield scrapy.Request( '', meta={'file_name': 'list.html'}, callback=self.parse_list, ) @handle_http_error def parse_list(self, response): urls = response.xpath('//a[@class="enlaces_contenido"]/@href').getall() for url in urls: if self.system: if self.available_systems[self.system] not in url: continue if self.from_date and self.until_date: # URL looks like year = int(url[-8:-4]) if not (self.from_date.year <= year <= self.until_date.year): continue yield self.build_request(url, formatter=components(-1))