app/ranklist/ranker.py
author Pawel Solyga <Pawel.Solyga@gmail.com>
Thu, 24 Sep 2009 21:15:45 +0200
changeset 2972 1e1b8f22e153
parent 1597 e7b7ea113a3b
permissions -rwxr-xr-x
Style fixes in seed_db module.

#!/usr/bin/python
#
# Copyright 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.appengine.api import datastore
from google.appengine.api import datastore_types

from common import transactional


class Ranker(object):
  """A data structure for storing integer scores and quickly retrieving their
  relative ranks.

  Scores are stored as "name: score" mapping. A score is inserted by
  calling SetScore with a new name. The score can later be updated by
  calling SetScore again with the same name.

  The scores are actually lists of integers with the same number of elements,
  and their ordering is lexicographic.  That is to say that score A is higher
  than score B if they are different, and the first element that differs between
  the two is higher in A.  Thus [5, 3] is ranked higher than [4, 99].

  Example Use Case:

  Some number of people are participating in a programming contest.  Solving a
  problem gets points; contestants also get a tie-breaking "penalty time."  The
  higher the time, the worse the score.

  STEP 1:
  # Creates the ranker when the contest is created:
  rank = ranker.Ranker.Create([0, 10000, -999999, 1], 100)
  # In our contest, people won't have more than 9999 points or 999999 penalty
  # seconds.  Since penalty time is bad, we store [score, -penalty].
  contest['ranker'] = rank.rootkey
  datastore.Put(contest)

  STEP 2:
  # Someone registers for the contest.  The default score is [0, 0], and for
  # efficiency we don't put them in the ranker; they won't be ahead of anyone
  # anyway.

  STEP 3:
  # Player "Jon" gets points for the first time!
  rank = ranker.Ranker(contest['ranker'])
  # Loads up the ranker.  This is the first step of all the STEPs below.
  rank.SetScore("Jon", [5, -120])  # 5 points, 2 minutes

  STEP 4:
  # Player "Jon" gets points for the second time!
  rank.SetScore("Jon", [10, -300])  # 10 points, 5 minutes

  STEP 5:
  # What is the rank of the person with score [10, -300]?
  position = rank.FindRank([10, 300])
  # What are the ranks for the people with scores a, b, c?
  positions = rank.FindRanks([a, b, c])

  STEP 6:
  # What is the score of the person ranked 20th?
  score = rank.FindScore(19)
  # This is particularly useful for seeing multiple pages of ranked people.  If
  # the scores are separately tracked in an entity called 'scores', the
  # datastore can efficiently answer:
  # q = datastore.Query('scores')
  # q['score <='] = rank.FindScore(1000)[0]
  # next_twenty_scores = q.Get(20)
  # This is a simplified case, where scores are a single integer.

  STEP 7:
  # How many people have points?
  num_pointy_people = rank.TotalRankedScores()



  Implementation Details:

  The Ranker is a rooted tree of 'ranker_node' entities.  It is an
  N-ary tree, where N = self.branching_factor, which is specified by the
  constructor.  Each node represents some range of scores, and is assigned a
  unique node_id.

  Take for example a 3-ary tree with point range [0, 10000, -36000, 1].  This
  could represent a contest where contestants can have between 0 and 9999
  points, with ties being broken by (negative) penalty seconds that can be
  between 0 and 10 hours.

  The root represents the score range [0, 10000, -36000, 1].  It has node_id 0.
  Its first child has node_id 1 and score range [0, 3333, -36000, 1].
  The root's second child, node_id 2, has score range [3333, 6666, -36000, 1],
  and its third child, node_id 3, has score range [6666, 10000, -36000, 1].
  Node 1's first child, node_id 4, has range [0, 1111, -36000, 1], and so on.
  See __WhichChild for details of how children are assigned score ranges.

  The point of a node is to track how many stored scores are in the score range
  of each of its children.  The root in the above example would start off with
  child_node_counts = [0, 0, 0]; adding score [4000, 0] would change the root's
  child_node_counts to [0, 1, 0] and node 5's child_node_counts to [1, 0, 0],
  and so forth.

  Ranker also has a "ranker_score" entity for every score stored in the ranker.
  These entities are part of the same entity group as the ranker_node
  entities. This allows for atomic, idempotent calls to SetScores.

  Ranker supports the following operations, which can be read about in detail
  in their docstrings:

  SetScores(scores): Set scores for multiple players.
  FindRank(score): Finds the 0-based rank of the provided score.
  FindScore(rank): Finds the score with the provided 0-based rank.
  FindScoreApproximate(rank): Finds a score >= the score of the provided 0-based
    rank, < the score of rank-1 (unless rank and rank-1 are tied, in which case
    it returns their mutual score).
  TotalRankedScores: The total number of scores in the Ranker.

  See __FindNodeIDs for more notes on structure.

  """

  def __init__(self, rootkey):
    """Pulls a ranker out of the datastore, given the key of the root node.

    Args:
      rootkey: The datastore key of the ranker.
    """
    # Get the root from the datastore:
    assert rootkey.kind() == "ranker"
    root = datastore.Get(rootkey)
    # Initialize some class variables:
    self.rootkey = rootkey
    self.score_range = root["score_range"]
    self.branching_factor = root["branching_factor"]
    # Sanity checking:
    assert len(self.score_range) > 1
    assert len(self.score_range) % 2 == 0
    for i in xrange(0, len(self.score_range), 2):
      assert self.score_range[i + 1] > self.score_range[i]
    assert self.branching_factor > 1

  @classmethod
  def Create(cls, score_range, branching_factor):
    """Constructs a new Ranker and returns it.

    Args:
      score_range: A list showing the range of valid scores, in the form:
        [most_significant_score_min, most_significant_score_max,
         less_significant_score_min, less_significant_score_max, ...]
        Ranges are [inclusive, exclusive)
      branching_factor: The branching factor of the tree.  The number of
        datastore Gets is Theta(1/log(branching_factor)), and the amount of data
        returned by each Get is Theta(branching_factor).

    Returns:
      A new Ranker.
    """
    # Put the root in the datastore:
    root = datastore.Entity("ranker")
    root["score_range"] = score_range
    root["branching_factor"] = branching_factor
    datastore.Put(root)
    myrank = Ranker(root.key())
    return myrank

  def __FindNodeIDs(self, score):
    """Finds the nodes along the path from the root to a certain score.

    Args:
      score: The score we're finding the path for.

    Returns:
      A sorted list of (node_id, child) tuples, indicating that node_id is the
      node id of a node on the path, and child is which child of that node is
      next.  Note that the lowest child node (which would be a leaf node) does
      not actually exist, since all its relevant information (number of times
      that score was inserted) is stored in its parent.

    Nodes are numbered row-by-row: the root is 0, its children are in the range
    [1, self.branching_factor + 1), its grandchildren are in the range
    [self.branching_factor + 1,
     self.branching_factor**2 + self.branching_factor + 1), etc.

    Score ranges are lists of the form: [min_0, max_0, min_1, max_1, ...]
    A node representing a score range will be divided up by the first index
    where max_i != min_i + 1 (score ranges are [inclusive, exclusive)).

    Child x (0-indexed) of a node [a,b) will get the range:
    [a+x*(b-a)/branching_factor, a+(x+1)*(b-a)/branching_factor);
    Thus not all nodes will have nonzero ranges.  Nodes with zero range will
    never be visited, but they and their descendants will be counted in the node
    numbering scheme, so row x still has self.branching_factor**x nodes.
    """
    nodes = []
    node = 0
    cur_range = list(self.score_range)
    # The current range of scores.  This will be narrowed as we move down the
    # tree; 'index' keeps track of the score type we're currently changing.
    for index in xrange(0, len(cur_range), 2):
      while cur_range[index + 1] - cur_range[index] > 1:
        # Subdivide cur_range[index]..cur_range[index + 1]
        which_child = self.__WhichChild(cur_range[index],
                                        cur_range[index + 1],
                                        score[index // 2],
                                        self.branching_factor)
        child = which_child[0]
        cur_range[index] = which_child[1][0]
        cur_range[index + 1] = which_child[1][1]
        assert 0 <= child < self.branching_factor
        nodes.append((node, child))
        node = self.__ChildNodeId(node, child)
    return nodes

  def __WhichChild(self, low, high, want, branching_factor):
    """Determines which child of the range [low, high) 'want' belongs to.

    Args:
      low: An int, the low end of the range.
      high: An int, the high end of the range.
      want: An int, the score we're trying to determine a child for.
      branching_factor: The branching factor of the tree being used.

    Returns:
      A tuple, (child, [child's score range]).  Note that in general a score
      has multiple sub-scores, written in order of decreasing significance; this
      function divides up a single sub-score.

    Raises:
      An AssertionError if things go horribly wrong.
    """
    assert low <= want < high
    # Need to find x such that (using integer division):
    #     x  *(high-low)/branching_factor <= want - low <
    #   (x+1)*(high-low)/branching_factor
    # Which is the least x such that (using integer division):
    #   want - low < (x+1)*(high-low)/branching_factor
    # Which is the ceiling of x such that (using floating point division):
    #   want - low + 1 == (x+1)*(high-low)/branching_factor
    #   x = -1 + math.ceil((want-low+1) * branching_factor / (high - low))
    # We get ceil by adding high - low - 1 to the numerator.
    x = -1 + (((want - low + 1) * branching_factor + high - low - 1) //
              (high - low))
    assert (x * (high - low) // branching_factor <=
            want - low < (x + 1) * (high - low) // branching_factor)
    return (x, self.__ChildScoreRange([low, high], x, branching_factor))

  def __ChildScoreRange(self, score_range, child, branching_factor):
    """Calculates the score_range for a node's child.

    Args:
      score_range: A score range [min0, max0, min1, max1, ...]
      child: Which child of the node with score range score_range we're
        calculating the score range of.
      branching_factor: The branching factor of the tree in question.

    Returns:
      A score range [min0', max0', min1', max1', ...] for that child.
    """
    for i in xrange(1, len(score_range), 2):
      if score_range[i] > score_range[i - 1] + 1:
        child_score_range = list(score_range)
        low, high = score_range[i - 1], score_range[i]
        child_score_range[i - 1], child_score_range[i] = (
            low + child * (high - low) // branching_factor,
            low + (child + 1) * (high - low) // branching_factor)
        return child_score_range
    raise AssertionError("Node with score range %s has no children." %
                         score_range)

  def __ChildNodeId(self, node_id, child):
    """Calculates the node id for a known node id's child.

    Args:
      node_id: The parent node's node_id
      child: Which child of the parent node we're finding the id for

    Returns:
      The node_id for the child'th child of node_id.
    """
    return node_id * self.branching_factor + 1 + child

  def __GetMultipleNodes(self, node_ids):
    """Gets multiple nodes from the datastore.

    Args:
      node_ids: A list of node ids we want to get.

    Returns:
      A dict of the nodes that were found, indexed by the node ids found
      in node_ids.
    """
    if len(node_ids) == 0:
      return []
    node_ids = set(node_ids)
    keys = [self.__KeyFromNodeId(node_id) for node_id in node_ids]
    nodes = datastore.Get(keys)
    return dict((node_id, node) for (node_id, node) in zip(node_ids, nodes)
                if node)

  # Although, this method is currently not needed, we'll keep this
  # since we might need it and some point and it's an interesting
  # relationship
  def __ParentNode(self, node_id):
    """Returns the node id of the parameter node id's parent.  Returns None if
    the parameter is 0."""
    if node_id == 0:
      return None
    return (node_id - 1) // self.branching_factor

  def __KeyFromNodeId(self, node_id):
    """Creates a (named) key for the node with a given id.

    The key will have the ranker as a parent element to guarantee
    uniqueness (in the presence of multiple rankers) and to put all
    nodes in a single entity group.

    Args:
      node_id: The node's id as an integer.

    Returns:
      A (named) key for the node with the id 'node_id'.
    """
    name = "node_%x" % node_id
    return datastore_types.Key.from_path("ranker_node", name,
                                         parent=self.rootkey)

  def __KeyForScore(self, name):
    """Returns a (named) key for a ranker_score entity.

    Args:
      name: Name of the score to create a key for.

    Returns:
      A (named) key for the entity storing the score of 'name'.
    """
    return datastore_types.Key.from_path("ranker_score", name,
                                         parent=self.rootkey)

  def __Increment(self, nodes_with_children, score_entities,
                  score_entities_to_delete):
    """Changes child counts for given nodes.

    This method will create nodes as needed.

    Args:
      nodes_with_children: A dict of (node_key, child) tuples to deltas
      score_entities: Additional score entities to persist as part of
        this transaction
    Returns:
      None
    """
    keys = list(set(key for ((key, _), delta) in nodes_with_children.iteritems()
                    if delta != 0))
    if not keys:
      return  # Nothing to do
    nodes = datastore.Get(keys)

    node_dict = {}
    for (key, node) in zip(keys, nodes):
      if not node:
        node = datastore.Entity("ranker_node", parent=self.rootkey,
                                name=key.name())
        node["child_counts"] = [0] * self.branching_factor
      node_dict[key] = node
    for ((key, child), amount) in nodes_with_children.iteritems():
      if amount != 0:
        node = node_dict[key]
        node["child_counts"][child] += amount
        assert node["child_counts"][child] >= 0
    datastore.Put(node_dict.values() + score_entities)
    if score_entities_to_delete:
      datastore.Delete(score_entities_to_delete)

  def SetScore(self, name, score):
    """Sets a single score.

    This is equivalent to calling 'SetScores({name: score})'

    Args:
      name: the name of the score as a string
      score: the score to set name to
    """
    return self.SetScores({name: score})

  @transactional
  def SetScores(self, scores):
    """Changes multiple scores atomically.

    Sets the scores of the named entities in scores to new values. For
    named entities that have not been registered with a score before,
    a new score is created. For named entities that already had a score,
    the score is changed to reflect the new score. If a score is None,
    the named entity's score will be removed from the ranker.

    Args:
      scores: A dict mapping entity names (strings) to scores (integer lists)
    """
    score_deltas, score_ents, score_ents_del = self.__ComputeScoreDeltas(scores)
    node_ids_to_deltas = self.__ComputeNodeModifications(score_deltas)
    self.__Increment(node_ids_to_deltas, score_ents, score_ents_del)

  def __ComputeScoreDeltas(self, scores):
    """Compute which scores have to be incremented and decremented.

    Args:
      scores: A dict mapping entity names to scores

    Returns:
      A tuple (score_deltas, score_entities, score_entities_to_delete).

      'score_deltas' is a dict, mapping scores (represented as tuples)
      to integers. 'score_deltas[s]' represents how many times the
      score 's' has to be incremented (or decremented).

      'score_entities' is a list of 'ranker_score' entities that have
      to be updated in the same transaction as modifying the ranker
      nodes. The entities already contain the updated score.

      Similarly, 'score_entities_to_delete' is a list of entities that
      have to be deleted in the same transaction as modifying the ranker
      nodes.
    """
    score_keys = [self.__KeyForScore(score) for score in scores]
    old_scores = {}
    for old_score in datastore.Get(score_keys):
      if old_score:
        old_scores[old_score.key().name()] = old_score
    score_deltas = {}
    # Score entities to update
    score_ents = []
    score_ents_del = []
    for score_name, score_value in scores.iteritems():
      if score_name in old_scores:
        score_ent = old_scores[score_name]
        if score_ent["value"] == score_value:
          continue  # No change in score => nothing to do
        old_score_key = tuple(score_ent["value"])
        score_deltas.setdefault(old_score_key, 0)
        score_deltas[old_score_key] -= 1
      else:
        score_ent = datastore.Entity("ranker_score", parent=self.rootkey,
                                     name=score_name)
      if score_value:
        score_key = tuple(score_value)
        score_deltas.setdefault(score_key, 0)
        score_deltas[score_key] += 1
        score_ent["value"] = score_value
        score_ents.append(score_ent)
      else:
        # Do we have to delete an old score entity?
        if score_name in old_scores:
          score_ents_del.append(old_scores[score_name])

    return (score_deltas, score_ents, score_ents_del)

  def __ComputeNodeModifications(self, score_deltas):
    """Computes modifications to ranker nodes.

    Given score deltas, computes which nodes need to be modified and by
    how much their child count has to be incremented / decremented.

    Args:
      score_deltas: A dict of scores to integers, as returned by
        _ComputeScoreDeltas.

    Returns:
      A dict of nodes (represented as node_key, child tuples) to integers.
      'result[(node_key, i)]' represents the amount that needs to be added to
      the i-th child of node node_key.
    """
    nodes_to_deltas = {}
    for score, delta in score_deltas.iteritems():
      for (node_id, child) in self.__FindNodeIDs(score):
        node = (self.__KeyFromNodeId(node_id), child)
        nodes_to_deltas[node] = nodes_to_deltas.get(node, 0) + delta
    return nodes_to_deltas

  def __FindRank(self, node_ids_with_children, nodes):
    """Utility function.  Finds the rank of a score.

    Args:
      node_ids_with_children: A list of node ids down to that score,
        paired with which child links to follow.
      nodes: A dict mapping node id to node entity.

    Returns:
      The score's rank.
    """
    tot = 0  # Counts the number of higher scores.
    for (node_id, child) in node_ids_with_children:
      if node_id in nodes:
        node = nodes[node_id]
        for i in xrange(child + 1, self.branching_factor):
          tot += node["child_counts"][i]
      else:
        # If the node isn't in the dict, the node simply doesn't exist.  We
        # are probably finding the rank for a score that doesn't appear in the
        # ranker, but that's perfectly fine.
        pass
    return tot

  def FindRank(self, score):
    """Finds the 0-based rank of a particular score; more precisely, returns the
    number of strictly higher scores stored.

    Args:
      score: The score whose rank we wish to find.

    Returns:
      The number of tracked scores that are higher.  Does not check whether
      anyone actually has the requested score.
    """
    return self.FindRanks([score])[0]

  def FindRanks(self, scores):
    """Finds the 0-based ranks of a number of particular scores.
    Like FindRank, but more efficient for multiple scores.

    Args:
      scores: A list of scores.

    Returns:
      A list of ranks.
    """
    for score in scores:
      assert len(score) * 2 == len(self.score_range)
    # Find the nodes we'll need to query to find information about these scores:
    node_ids_with_children_list = [self.__FindNodeIDs(score)
                                   for score in scores]
    node_ids = []
    for node_ids_with_children in node_ids_with_children_list:
      node_ids += [node_id for (node_id, _) in node_ids_with_children]
    # Query the needed nodes:
    nodes_dict = self.__GetMultipleNodes(node_ids)
    # Call __FindRank, which does the math, for each score:
    return [self.__FindRank(node_ids_with_children, nodes_dict) for
            node_ids_with_children in node_ids_with_children_list]

  def __FindScore(self, node_id, rank, score_range, approximate):
    """To be run in a transaction.  Finds the score ranked 'rank' in the subtree
    defined by node 'nodekey.'

    Args:
      node_id: The id of the node whose subtree we wish to find the
        score of rank 'rank' in.
      rank: The rank (within this subtree) of the score we wish to find.
      score_range: The score range for this particular node, as a list.
        Derivable from the node's node_id, but included for convenience.
      approximate: Do we have to return an approximate result, or an exact one?
        See the docstrings for FindScore and FindScoreApproximate.

    Returns:
      A tuple, (score, rank_of_tie), indicating the score's rank within
      node_id's subtree.  The way it indicates rank is defined in the dosctrings
      of FindScore and FindScoreApproximate, depending on the value of
      'approximate'.
    """
    # If we're approximating and thus allowed to do so, early-out if we just
    # need to return the highest available score.
    if approximate and rank == 0:
      return ([score - 1 for score in score_range[1::2]], 0)
    # Find the current node.
    node = datastore.Get(self.__KeyFromNodeId(node_id))
    child_counts = node["child_counts"]
    initial_rank = rank
    for i in xrange(self.branching_factor - 1, -1, -1):
      # If this child has enough scores that rank 'rank' is in
      # there, recurse.
      if rank - child_counts[i] < 0:
        child_score_range = self.__ChildScoreRange(score_range, i,
                                                   self.branching_factor)
        if self.__IsSingletonRange(child_score_range):
          # Base case; child_score_range refers to a single score. We don't
          # store leaf nodes so we can return right here.
          return (child_score_range[0::2], initial_rank - rank)
        # Not a base case.  Keep descending into children.
        ans = self.__FindScore(self.__ChildNodeId(node_id, i), rank,
                               child_score_range,
                               approximate)
        # Note the 'initial_rank - rank': we've asked the child for a score of
        # some rank among *its* children, so we have to add back in the scores
        # discarded on the way to that child.
        return (ans[0], ans[1] + (initial_rank - rank))
      else:
        rank -= child_counts[i]
    return None

  def __IsSingletonRange(self, scorerange):
    """Returns whether a range contains exactly one score."""
    return [score + 1 for score in scorerange[0::2]] == scorerange[1::2]

  @transactional
  def FindScore(self, rank):
    """Finds the score ranked at 'rank'.

    Args:
      rank: The rank of the score we wish to find.

    Returns:
      A tuple, (score, rank_of_tie).  'score' is the score ranked at 'rank',
      'rank_of_tie' is the rank of that score (which may be different from
      'rank' in the case of ties).
      e.g. if there are two scores tied at 5th and rank == 6, returns
      (score, 5).
    """
    return self.__FindScore(0, rank, self.score_range, False)

  @transactional
  def FindScoreApproximate(self, rank):
    """Finds a score that >= the score ranked at 'rank'.

    This method could be preferred to FindScore because it is more efficient.
    For example, if the objective is to find the top 50 scores of rank X or
    less, and those scores are stored in entities called scoreboard_row:
      score, rank = myrank.FindScoreApproximate(X)
      query = datastore.Query('scoreboard_row')
      query['score <='] = score
      result = query.Get(50 + X - rank)[X-rank:])  # Takes care of ties.

    Args:
      rank: The rank of the score we wish to find.

    Returns:
      A tuple, (score, rank_of_tie).
      If there is a tie at rank 'rank-1':
        rank's score <= score < rank-1's score, rank_of_tie == rank
      else:
        score == rank's score, rank_of_tie == the tied rank of everyone
        in the tie.
        e.g. if two scores are tied at 5th and rank == 6, returns (score, 5).
    """
    return self.__FindScore(0, rank, self.score_range, True)

  def TotalRankedScores(self):
    """Returns the total number of ranked scores.

    Returns:
      The total number of ranked scores.
    """
    root = datastore.Get([self.__KeyFromNodeId(0)])[0]
    if root:
      return sum(root["child_counts"])
    else:
      # Ranker doesn't have any ranked scores, yet
      return 0