Source code for kingfisher_scrapy.base_spiders.periodic_spider

from kingfisher_scrapy import util
from kingfisher_scrapy.base_spiders import SimpleSpider


class PeriodicSpider(SimpleSpider):
    """
    This class makes it easy to collect data from an API that accepts a year, year-month or date as a query string
    parameter or URL path component.

    #. Inherit from ``PeriodicSpider``
    #. Set a ``date_format`` class attribute to "year", "year-month" or "date"
    #. Set a ``pattern`` class attribute to a URL pattern, with placeholders. If the ``date_format`` is "year", then a
       year is passed to the placeholder as an ``int``. If the ``date_format`` is "year-month", then the first day of
       the month is passed to the placeholder as a ``date``, which you can format as, for example:

       .. code-block:: python

          pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0:%Y}/{0:%m}'

       If the ``date_format`` is "date", then a 2-tuple of ``date`` objects in intervals of ``step`` days is passed to
       the placeholder, which you can format as, for example:

       .. code-block:: python

          pattern = 'https://api.dgcp.gob.do/api/date/{0:%Y-%m-%d}/{1:%Y-%m-%d}/1'

    #. Set a ``formatter`` class attribute to set the file name like in
       :meth:`~kingfisher_scrapy.base_spiders.BaseSpider.build_request`
    #. Set a ``default_from_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM")
    #. If the source stopped publishing, set a ``default_until_date`` class attribute to a year or year-month
    #. Optionally, if the ``date_format`` is "date", set a ``step`` class attribute to indicate the length of
       intervals, in days – otherwise, it defaults to 1
    #. Optionally, set a ``start_requests_callback`` class attribute to a method's name as a string - otherwise, it
       defaults to :meth:`~kingfisher_scrapy.base_spiders.simple_spider.SimpleSpider.parse`

    If ``sample`` is set, the data from the most recent year or month is retrieved.
    """

    # PeriodicSpider requires date parameters to be always set.
    date_required = True
    # Name of the method used as the callback of the initial requests; replaced by the bound method in __init__.
    start_requests_callback = 'parse'
    # Length of intervals, if `date_format` is "date".
    step = 1

    def __init__(self, *args, **kwargs):
        """
        Initialize the spider, then replace ``start_requests_callback`` (a method name) with the bound method of
        that name on this instance.

        :raises AttributeError: if the spider has no attribute with the configured name
        """
        super().__init__(*args, **kwargs)
        self.start_requests_callback = getattr(self, self.start_requests_callback)

    def start_requests(self):
        """
        Yields one request per URL returned by :meth:`build_urls`, for each period between ``from_date`` and
        ``until_date``.

        Within a single period, the n-th URL (counting from 0) is requested with a priority of ``-n``.
        """
        start = self.from_date
        stop = self.until_date

        # NOTE(review): ``date_format`` is compared to strftime-style strings here, whereas subclasses set it to
        # "year", "year-month" or "date" - presumably it is converted before this method runs; confirm in the
        # base spider.
        if self.date_format == '%Y':
            date_range = util.date_range_by_year(start.year, stop.year)
        elif self.date_format == '%Y-%m':
            date_range = util.date_range_by_month(start, stop)
        else:
            date_range = util.date_range_by_interval(start, stop, self.step)

        # Only the "date" format is unpacked into separate (from_date, until_date) arguments for build_urls().
        unpack_interval = self.date_format == '%Y-%m-%d'

        for date in date_range:
            args = date if unpack_interval else [date]
            for number, url in enumerate(self.build_urls(*args)):
                yield self.build_request(
                    url,
                    formatter=self.formatter,
                    callback=self.start_requests_callback,
                    priority=-number,
                )

    def build_urls(self, from_date, until_date=None):
        """
        Yields one or more URLs for the given date.

        :param from_date: the value passed to the first placeholder (``{0}``) of ``pattern``
        :param until_date: the value passed to the second placeholder (``{1}``) of ``pattern``, if provided
        """
        # Compare to the sentinel explicitly, rather than relying on the truthiness of the value.
        if until_date is not None:
            yield self.pattern.format(from_date, until_date)
        else:
            yield self.pattern.format(from_date)