Source code for renewal_recsystem.articles

"""
Implements the `~renewal_recsystem.articles.ArticleCollection` class, for
maintaining a size-limited cache of articles known by the system.

Recsystems can use this if they like, but may also replace it with a more
sophisticated data store, such as a database, for maintaining a collection of
articles sent to the system.
"""


import bisect


class ArticleCollection:
    r"""
    Maintain a list of article objects sorted by article_id (ascending).

    Articles are currently just represented as `dict`\s and ``'article_id'`` is
    their only required key.

    Two pieces of state are kept in sync: ``article_ids``, a sorted list of
    the IDs currently held, and ``articles``, a `dict` mapping each of those
    IDs to its article.

    .. note::

        It might be a good idea to also feature validation of articles against
        a public schema.
    """

    MAX_ARTICLES = 10000
    """Maximum number of articles to keep cached in memory."""

    def __init__(self, initial=None, max_size=None):
        """
        Create a collection, optionally pre-populated from ``initial``.

        Parameters
        ----------
        initial : iterable of dict, optional
            Articles to seed the collection with.  If the same article_id
            appears more than once, the first occurrence is kept.  If there
            are more than ``max_size`` distinct IDs, only the articles with
            the ``max_size`` highest IDs are retained.
        max_size : int, optional
            Capacity of the collection; a falsy value (``None`` or ``0``)
            selects `MAX_ARTICLES`.

        Examples
        --------

        >>> articles = ArticleCollection(
        ...     [{'article_id': id_} for id_ in range(1, 6)], max_size=2)
        ...
        >>> articles.article_ids
        [4, 5]
        >>> 1 in articles
        False
        """

        self.article_ids = []
        self.articles = {}
        self.max_size = max_size or self.MAX_ARTICLES
        if initial:
            unique = {}
            for item in initial:
                # setdefault keeps the first article seen for a given ID
                unique.setdefault(item['article_id'], item)

            # Limit to the max_size highest article IDs, and make sure the
            # lookup dict only holds those same IDs so that the size limit
            # really bounds memory and __contains__ agrees with __len__.
            kept_ids = sorted(unique)[-self.max_size:]
            self.article_ids = kept_ids
            self.articles = {id_: unique[id_] for id_ in kept_ids}

    def __len__(self):
        """Return the number of articles currently in the collection."""

        return len(self.article_ids)

    def __contains__(self, article_id):
        """
        Returns `True` if an article with a given article_id is in the
        collection.

        Examples
        --------

        >>> from renewal_recsystem.articles import ArticleCollection
        >>> articles = ArticleCollection([{'article_id': 2}])
        >>> 2 in articles
        True
        >>> 1 in articles
        False
        """

        return article_id in self.articles

    def __iter__(self):
        """
        Iterate over all articles in descending order of ``article_id``.

        Example
        -------
        >>> from renewal_recsystem.articles import ArticleCollection
        >>> articles = ArticleCollection([
        ...     {'article_id': id_} for id_ in range(1, 4)])
        ...
        >>> for article in articles:
        ...     print(article)
        ...
        {'article_id': 3}
        {'article_id': 2}
        {'article_id': 1}
        """

        for article_id in reversed(self.article_ids):
            yield self.articles[article_id]

    def __getitem__(self, article_id):
        """
        Retrieve items from the collection by article_id or a range of
        article_ids.

        Examples
        --------

        >>> from renewal_recsystem.articles import ArticleCollection
        >>> articles = ArticleCollection([
        ...     {'article_id': id_} for id_ in range(1, 10, 2)])
        ...
        >>> len(articles)
        5

        Articles are looked up by their article_id, not their position in the
        collection (which may have gaps between article_ids):

        >>> articles[1]
        {'article_id': 1}
        >>> articles[3]
        {'article_id': 3}

        If the article_id is not in the collection an `IndexError` is raised:

        >>> articles[2]
        Traceback (most recent call last):
        ...
        IndexError: 2

        Slicing an `ArticleCollection` returns a new `ArticleCollection`
        (with the same ``max_size``) containing only the range of IDs
        specified in the slice:

        >>> articles[:]
        <renewal_recsystem.articles.ArticleCollection object at 0x...>

        When slicing the start index is inclusive and contains all articles
        with greater than or equal article_ids:

        >>> list(articles[5:])
        [{'article_id': 9}, {'article_id': 7}, {'article_id': 5}]

        If the start index is not in the collection, the same rule still
        applies--all articles with greater than or equal article_ids are
        returned:

        >>> list(articles[6:])
        [{'article_id': 9}, {'article_id': 7}]

        This includes if there are no such articles; this is consistent with
        the rules for slicing lists:

        >>> list(articles[10000:])
        []

        Also like lists, the end of a slice is *exclusive*--it ensures all
        articles with article_id less than the end point are returned:

        >>> list(articles[:5])
        [{'article_id': 3}, {'article_id': 1}]
        >>> list(articles[:6])
        [{'article_id': 5}, {'article_id': 3}, {'article_id': 1}]
        >>> list(articles[:1])
        []
        """

        if not isinstance(article_id, slice):
            # The single article case is simple.
            try:
                return self.articles[article_id]
            except KeyError:
                # The KeyError is an implementation detail; callers only
                # need to see the IndexError.
                raise IndexError(article_id) from None

        # Select ranges of article IDs--this can be tricky because although
        # self.article_ids is assumed to be sorted, it may have missing items
        # in the range
        slc = article_id
        start = slc.start
        stop = slc.stop

        # Translate the article_id bounds into positions in the sorted ID
        # list.  bisect_left gives the position of the first ID >= the bound,
        # which makes the start inclusive and the stop exclusive even when
        # the bound itself is not in the collection.
        if start is not None:
            start = bisect.bisect_left(self.article_ids, start)

        if stop is not None:
            stop = bisect.bisect_left(self.article_ids, stop)

        ids = self.article_ids[start:stop:slc.step]

        # Carry max_size over; otherwise slicing a collection larger than
        # MAX_ARTICLES would silently drop articles from the result.
        return self.__class__([self.articles[id_] for id_ in ids],
                              max_size=self.max_size)

    def push(self, item):
        """
        Push a new article to the collection while maintaining the sort
        invariant.

        If the new article is already lower than the lowest article ID and the
        collection is already at capacity, it is discarded.

        Examples
        --------

        >>> from renewal_recsystem.articles import ArticleCollection
        >>> articles = ArticleCollection(max_size=2)
        >>> articles.push({'article_id': 2})
        >>> len(articles)
        1
        >>> articles[2]
        {'article_id': 2}

        If the article_id already exists this push is ignored (it does not
        merge with or overwrite the existing article):

        >>> articles.push({'article_id': 2, 'extra': True})
        >>> len(articles)
        1
        >>> articles[2]
        {'article_id': 2}

        If the collection is at capacity, the article with the lowest
        article_id is dropped on the next push:

        >>> articles.push({'article_id': 3})
        >>> articles.push({'article_id': 4})
        >>> len(articles)
        2
        >>> 2 in articles
        False

        Unless the new article is already below the lowest article_id, then the
        push is ignored:

        >>> articles.push({'article_id': 1})
        >>> len(articles)
        2
        >>> 1 in articles
        False
        >>> articles.article_ids
        [3, 4]
        """

        id_ = item['article_id']
        if (id_ in self.articles or
                (len(self.article_ids) == self.max_size and
                    id_ < self.article_ids[0])):
            return

        bisect.insort_left(self.article_ids, id_)
        self.articles[id_] = item

        if len(self.article_ids) > self.max_size:
            # Evict the lowest ID from both structures.  Nothing may be
            # re-inserted after this point: if the evicted ID happens to be
            # the one just pushed it must stay evicted, or the dict and the
            # ID list would fall out of sync.
            old_id = self.article_ids.pop(0)
            del self.articles[old_id]
Source code for renewal_recsystem.articles

Renewal Recsystems

Navigation

Related Topics