Source code for renewal_recsystem.articles

"""
Implements the `~renewal_recsystem.articles.ArticleCollection` class, for
maintaining a size-limited cache of articles known by the system.

Recsystems can use this if they like, but may also replace it with a more
sophisticated data store, such as a database, for maintaining a collection of
articles sent to the system.
"""


import bisect


[docs]class ArticleCollection: r""" Maintain a list of article objects sorted by article_id (ascending). Articles are currently just represented as `dict`\s and ``'article_id'`` is their only required key. .. note:: It might be a good idea to also feature validation of articles against a public schema. """ MAX_ARTICLES = 10000 """Maximum number of articles to keep cached in memory.""" def __init__(self, initial=None, max_size=None): self.article_ids = [] self.articles = {} self.max_size = max_size or self.MAX_ARTICLES if initial: for item in initial: id_ = item['article_id'] if id_ not in self.articles: self.article_ids.append(id_) self.articles[id_] = item self.article_ids = sorted(self.article_ids) # Limit to the max_size highest article IDs self.article_ids = self.article_ids[-self.max_size:] def __len__(self): return len(self.article_ids) def __contains__(self, article_id): """ Returns `True` if an article with a given article_id is in the collection. Examples -------- >>> from renewal_recsystem.articles import ArticleCollection >>> articles = ArticleCollection([{'article_id': 2}]) >>> 2 in articles True >>> 1 in articles False """ return article_id in self.articles def __iter__(self): """ Iterate over all articles in descending order of ``article_id``. Example ------- >>> from renewal_recsystem.articles import ArticleCollection >>> articles = ArticleCollection([ ... {'article_id': id_} for id_ in range(1, 4)]) ... >>> for article in articles: ... print(article) ... {'article_id': 3} {'article_id': 2} {'article_id': 1} """ for article_id in reversed(self.article_ids): yield self.articles[article_id] def __getitem__(self, article_id): """ Retrieve items from the collection by article_id or a range of article_ids. Examples -------- >>> from renewal_recsystem.articles import ArticleCollection >>> articles = ArticleCollection([ ... {'article_id': id_} for id_ in range(1, 10, 2)]) ... >>> len(articles) 5 Articles are looked up by their article_id, not their position in the collection (which may have gaps between article_ids): >>> articles[1] {'article_id': 1} >>> articles[3] {'article_id': 3} If the article_id is not in the collection and `IndexError` is raised: >>> articles[2] Traceback (most recent call last): ... IndexError: 2 Slicing an `ArticleCollection` returns a new `ArticleCollection` containing only the range of IDs specified in the slice: >>> articles[:] <renewal_recsystem.articles.ArticleCollection object at 0x...> When slicing the start index is inclusive and contains all articles with greater than or equal article_ids: >>> list(articles[5:]) [{'article_id': 9}, {'article_id': 7}, {'article_id': 5}] If the start index is not in the collection, the same rule still applies--all articles with greater than or equal article_ids are returned: >>> list(articles[6:]) [{'article_id': 9}, {'article_id': 7}] This includes if there are no such articles; this is consistent with the rules for slicing lists: >>> list(articles[10000:]) [] Also like lists, the end of a slice is *exclusive*--it ensures all articles with article_id less than the end point are returned: >>> list(articles[:5]) [{'article_id': 3}, {'article_id': 1}] >>> list(articles[:6]) [{'article_id': 5}, {'article_id': 3}, {'article_id': 1}] >>> list(articles[:1]) [] """ if not isinstance(article_id, slice): # The single article case is simple. try: return self.articles[article_id] except KeyError: raise IndexError(article_id) # Select ranges of article IDs--this can be tricky because although # self.article_ids is assumed to be sorted, it have missing items in # the range slc = article_id start = slc.start stop = slc.stop if start is not None: start = bisect.bisect_left(self.article_ids, start) if stop is not None: # reverse enumerate stop = bisect.bisect_left(self.article_ids, stop) ids = self.article_ids[start:stop:slc.step] return self.__class__([self.articles[id_] for id_ in ids])
[docs] def push(self, item): """ Push a new article to the collection while maintaining the sort invariant. If the new article is already than the lowest article ID and the collection is already at capacity, it is discarded. Examples -------- >>> from renewal_recsystem.articles import ArticleCollection >>> articles = ArticleCollection(max_size=2) >>> articles.push({'article_id': 2}) >>> len(articles) 1 >>> articles[2] {'article_id': 2} If the article_id already exists this push is ignored (it does not merge with or overwrite the existing article): >>> articles.push({'article_id': 2, 'extra': True}) >>> len(articles) 1 >>> articles[2] {'article_id': 2} If the collection is at capacity, the article with the lowest article_id is dropped on the next push: >>> articles.push({'article_id': 3}) >>> articles.push({'article_id': 4}) >>> len(articles) 2 >>> 2 in articles False Unless the new article is already below the lowest article_id, then the push is ignored: >>> articles.push({'article_id': 1}) >>> len(articles) 2 >>> 1 in articles False >>> articles.article_ids [3, 4] """ id_ = item['article_id'] if (id_ in self.articles or (len(self.article_ids) == self.max_size and id_ < self.article_ids[0])): return bisect.insort_left(self.article_ids, id_) self.articles[id_] = item if len(self.article_ids) > self.max_size: old_id = self.article_ids.pop(0) del self.articles[old_id] self.articles[id_] = item