Source code for renewal_recsystem.articles
"""
Implements the `~renewal_recsystem.articles.ArticleCollection` class, for
maintaining a size-limited cache of articles known by the system.
Recsystems can use this if they like, but may also replace it with a more
sophisticated data store, such as a database, for maintaining a collection of
articles sent to the system.
"""
import bisect
[docs]class ArticleCollection:
r"""
Maintain a list of article objects sorted by article_id (ascending).
Articles are currently just represented as `dict`\s and ``'article_id'`` is
their only required key.
.. note::
It might be a good idea to also feature validation of articles against
a public schema.
"""
MAX_ARTICLES = 10000
"""Maximum number of articles to keep cached in memory."""
def __init__(self, initial=None, max_size=None):
self.article_ids = []
self.articles = {}
self.max_size = max_size or self.MAX_ARTICLES
if initial:
for item in initial:
id_ = item['article_id']
if id_ not in self.articles:
self.article_ids.append(id_)
self.articles[id_] = item
self.article_ids = sorted(self.article_ids)
# Limit to the max_size highest article IDs
self.article_ids = self.article_ids[-self.max_size:]
def __len__(self):
return len(self.article_ids)
def __contains__(self, article_id):
"""
Returns `True` if an article with a given article_id is in the
collection.
Examples
--------
>>> from renewal_recsystem.articles import ArticleCollection
>>> articles = ArticleCollection([{'article_id': 2}])
>>> 2 in articles
True
>>> 1 in articles
False
"""
return article_id in self.articles
def __iter__(self):
"""
Iterate over all articles in descending order of ``article_id``.
Example
-------
>>> from renewal_recsystem.articles import ArticleCollection
>>> articles = ArticleCollection([
... {'article_id': id_} for id_ in range(1, 4)])
...
>>> for article in articles:
... print(article)
...
{'article_id': 3}
{'article_id': 2}
{'article_id': 1}
"""
for article_id in reversed(self.article_ids):
yield self.articles[article_id]
def __getitem__(self, article_id):
"""
Retrieve items from the collection by article_id or a range of
article_ids.
Examples
--------
>>> from renewal_recsystem.articles import ArticleCollection
>>> articles = ArticleCollection([
... {'article_id': id_} for id_ in range(1, 10, 2)])
...
>>> len(articles)
5
Articles are looked up by their article_id, not their position in the
collection (which may have gaps between article_ids):
>>> articles[1]
{'article_id': 1}
>>> articles[3]
{'article_id': 3}
If the article_id is not in the collection and `IndexError` is raised:
>>> articles[2]
Traceback (most recent call last):
...
IndexError: 2
Slicing an `ArticleCollection` returns a new `ArticleCollection`
containing only the range of IDs specified in the slice:
>>> articles[:]
<renewal_recsystem.articles.ArticleCollection object at 0x...>
When slicing the start index is inclusive and contains all articles
with greater than or equal article_ids:
>>> list(articles[5:])
[{'article_id': 9}, {'article_id': 7}, {'article_id': 5}]
If the start index is not in the collection, the same rule still
applies--all articles with greater than or equal article_ids are
returned:
>>> list(articles[6:])
[{'article_id': 9}, {'article_id': 7}]
This includes if there are no such articles; this is consistent with
the rules for slicing lists:
>>> list(articles[10000:])
[]
Also like lists, the end of a slice is *exclusive*--it ensures all
articles with article_id less than the end point are returned:
>>> list(articles[:5])
[{'article_id': 3}, {'article_id': 1}]
>>> list(articles[:6])
[{'article_id': 5}, {'article_id': 3}, {'article_id': 1}]
>>> list(articles[:1])
[]
"""
if not isinstance(article_id, slice):
# The single article case is simple.
try:
return self.articles[article_id]
except KeyError:
raise IndexError(article_id)
# Select ranges of article IDs--this can be tricky because although
# self.article_ids is assumed to be sorted, it have missing items in
# the range
slc = article_id
start = slc.start
stop = slc.stop
if start is not None:
start = bisect.bisect_left(self.article_ids, start)
if stop is not None:
# reverse enumerate
stop = bisect.bisect_left(self.article_ids, stop)
ids = self.article_ids[start:stop:slc.step]
return self.__class__([self.articles[id_] for id_ in ids])
[docs] def push(self, item):
"""
Push a new article to the collection while maintaining the sort
invariant.
If the new article is already than the lowest article ID and the
collection is already at capacity, it is discarded.
Examples
--------
>>> from renewal_recsystem.articles import ArticleCollection
>>> articles = ArticleCollection(max_size=2)
>>> articles.push({'article_id': 2})
>>> len(articles)
1
>>> articles[2]
{'article_id': 2}
If the article_id already exists this push is ignored (it does not
merge with or overwrite the existing article):
>>> articles.push({'article_id': 2, 'extra': True})
>>> len(articles)
1
>>> articles[2]
{'article_id': 2}
If the collection is at capacity, the article with the lowest
article_id is dropped on the next push:
>>> articles.push({'article_id': 3})
>>> articles.push({'article_id': 4})
>>> len(articles)
2
>>> 2 in articles
False
Unless the new article is already below the lowest article_id, then the
push is ignored:
>>> articles.push({'article_id': 1})
>>> len(articles)
2
>>> 1 in articles
False
>>> articles.article_ids
[3, 4]
"""
id_ = item['article_id']
if (id_ in self.articles or
(len(self.article_ids) == self.max_size and
id_ < self.article_ids[0])):
return
bisect.insort_left(self.article_ids, id_)
self.articles[id_] = item
if len(self.article_ids) > self.max_size:
old_id = self.article_ids.pop(0)
del self.articles[old_id]
self.articles[id_] = item