app/ranklist/ranker.py
changeset 1597 e7b7ea113a3b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/app/ranklist/ranker.py	Mon Mar 02 22:57:40 2009 +0000
@@ -0,0 +1,655 @@
+#!/usr/bin/python
+#
+# Copyright 2008 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from google.appengine.api import datastore
+from google.appengine.api import datastore_types
+
+from common import transactional
+
+
+class Ranker(object):
+  """A data structure for storing integer scores and quickly retrieving their
+  relative ranks.
+
+  Scores are stored as "name: score" mapping. A score is inserted by
+  calling SetScore with a new name. The score can later be updated by
+  calling SetScore again with the same name.
+
+  The scores are actually lists of integers with the same number of elements,
+  and their ordering is lexicographic.  That is to say that score A is higher
+  than score B if they are different, and the first element that differs between
+  the two is higher in A.  Thus [5, 3] is ranked higher than [4, 99].
+
+  Example Use Case:
+
+  Some number of people are participating in a programming contest.  Solving a
+  problem gets points; contestants also get a tie-breaking "penalty time."  The
+  higher the time, the worse the score.
+
+  STEP 1:
+  # Creates the ranker when the contest is created:
+  rank = ranker.Ranker.Create([0, 10000, -999999, 1], 100)
+  # In our contest, people won't have more than 9999 points or 999999 penalty
+  # seconds.  Since penalty time is bad, we store [score, -penalty].
+  contest['ranker'] = rank.rootkey
+  datastore.Put(contest)
+
+  STEP 2:
+  # Someone registers for the contest.  The default score is [0, 0], and for
+  # efficiency we don't put them in the ranker; they won't be ahead of anyone
+  # anyway.
+
+  STEP 3:
+  # Player "Jon" gets points for the first time!
+  rank = ranker.Ranker(contest['ranker'])
+  # Loads up the ranker.  This is the first step of all the STEPs below.
+  rank.SetScore("Jon", [5, -120])  # 5 points, 2 minutes
+
+  STEP 4:
+  # Player "Jon" gets points for the second time!
+  rank.SetScore("Jon", [10, -300])  # 10 points, 5 minutes
+
+  STEP 5:
+  # What is the rank of the person with score [10, -300]?
+  position = rank.FindRank([10, 300])
+  # What are the ranks for the people with scores a, b, c?
+  positions = rank.FindRanks([a, b, c])
+
+  STEP 6:
+  # What is the score of the person ranked 20th?
+  score = rank.FindScore(19)
+  # This is particularly useful for seeing multiple pages of ranked people.  If
+  # the scores are separately tracked in an entity called 'scores', the
+  # datastore can efficiently answer:
+  # q = datastore.Query('scores')
+  # q['score <='] = rank.FindScore(1000)[0]
+  # next_twenty_scores = q.Get(20)
+  # This is a simplified case, where scores are a single integer.
+
+  STEP 7:
+  # How many people have points?
+  num_pointy_people = rank.TotalRankedScores()
+
+
+
+  Implementation Details:
+
+  The Ranker is a rooted tree of 'ranker_node' entities.  It is an
+  N-ary tree, where N = self.branching_factor, which is specified by the
+  constructor.  Each node represents some range of scores, and is assigned a
+  unique node_id.
+
+  Take for example a 3-ary tree with point range [0, 10000, -36000, 1].  This
+  could represent a contest where contestants can have between 0 and 9999
+  points, with ties being broken by (negative) penalty seconds that can be
+  between 0 and 10 hours.
+
+  The root represents the score range [0, 10000, -36000, 1].  It has node_id 0.
+  Its first child has node_id 1 and score range [0, 3333, -36000, 1].
+  The root's second child, node_id 2, has score range [3333, 6666, -36000, 1],
+  and its third child, node_id 3, has score range [6666, 10000, -36000, 1].
+  Node 1's first child, node_id 4, has range [0, 1111, -36000, 1], and so on.
+  See __WhichChild for details of how children are assigned score ranges.
+
+  The point of a node is to track how many stored scores are in the score range
+  of each of its children.  The root in the above example would start off with
+  child_node_counts = [0, 0, 0]; adding score [4000, 0] would change the root's
+  child_node_counts to [0, 1, 0] and node 5's child_node_counts to [1, 0, 0],
+  and so forth.
+
+  Ranker also has a "ranker_score" entity for every score stored in the ranker.
+  These entities are part of the same entity group as the ranker_node
+  entities. This allows for atomic, idempotent calls to SetScores.
+
+  Ranker supports the following operations, which can be read about in detail
+  in their docstrings:
+
+  SetScores(scores): Set scores for multiple players.
+  FindRank(score): Finds the 0-based rank of the provided score.
+  FindScore(rank): Finds the score with the provided 0-based rank.
+  FindScoreApproximate(rank): Finds a score >= the score of the provided 0-based
+    rank, < the score of rank-1 (unless rank and rank-1 are tied, in which case
+    it returns their mutual score).
+  TotalRankedScores: The total number of scores in the Ranker.
+
+  See __FindNodeIDs for more notes on structure.
+
+  """
+
+  def __init__(self, rootkey):
+    """Pulls a ranker out of the datastore, given the key of the root node.
+
+    Args:
+      rootkey: The datastore key of the ranker.
+    """
+    # Get the root from the datastore:
+    assert rootkey.kind() == "ranker"
+    root = datastore.Get(rootkey)
+    # Initialize some class variables:
+    self.rootkey = rootkey
+    self.score_range = root["score_range"]
+    self.branching_factor = root["branching_factor"]
+    # Sanity checking:
+    assert len(self.score_range) > 1
+    assert len(self.score_range) % 2 == 0
+    for i in xrange(0, len(self.score_range), 2):
+      assert self.score_range[i + 1] > self.score_range[i]
+    assert self.branching_factor > 1
+
+  @classmethod
+  def Create(cls, score_range, branching_factor):
+    """Constructs a new Ranker and returns it.
+
+    Args:
+      score_range: A list showing the range of valid scores, in the form:
+        [most_significant_score_min, most_significant_score_max,
+         less_significant_score_min, less_significant_score_max, ...]
+        Ranges are [inclusive, exclusive)
+      branching_factor: The branching factor of the tree.  The number of
+        datastore Gets is Theta(1/log(branching_factor)), and the amount of data
+        returned by each Get is Theta(branching_factor).
+
+    Returns:
+      A new Ranker.
+    """
+    # Put the root in the datastore:
+    root = datastore.Entity("ranker")
+    root["score_range"] = score_range
+    root["branching_factor"] = branching_factor
+    datastore.Put(root)
+    myrank = Ranker(root.key())
+    return myrank
+
+  def __FindNodeIDs(self, score):
+    """Finds the nodes along the path from the root to a certain score.
+
+    Args:
+      score: The score we're finding the path for.
+
+    Returns:
+      A sorted list of (node_id, child) tuples, indicating that node_id is the
+      node id of a node on the path, and child is which child of that node is
+      next.  Note that the lowest child node (which would be a leaf node) does
+      not actually exist, since all its relevant information (number of times
+      that score was inserted) is stored in its parent.
+
+    Nodes are numbered row-by-row: the root is 0, its children are in the range
+    [1, self.branching_factor + 1), its grandchildren are in the range
+    [self.branching_factor + 1,
+     self.branching_factor**2 + self.branching_factor + 1), etc.
+
+    Score ranges are lists of the form: [min_0, max_0, min_1, max_1, ...]
+    A node representing a score range will be divided up by the first index
+    where max_i != min_i + 1 (score ranges are [inclusive, exclusive)).
+
+    Child x (0-indexed) of a node [a,b) will get the range:
+    [a+x*(b-a)/branching_factor, a+(x+1)*(b-a)/branching_factor);
+    Thus not all nodes will have nonzero ranges.  Nodes with zero range will
+    never be visited, but they and their descendants will be counted in the node
+    numbering scheme, so row x still has self.branching_factor**x nodes.
+    """
+    nodes = []
+    node = 0
+    cur_range = list(self.score_range)
+    # The current range of scores.  This will be narrowed as we move down the
+    # tree; 'index' keeps track of the score type we're currently changing.
+    for index in xrange(0, len(cur_range), 2):
+      while cur_range[index + 1] - cur_range[index] > 1:
+        # Subdivide cur_range[index]..cur_range[index + 1]
+        which_child = self.__WhichChild(cur_range[index],
+                                        cur_range[index + 1],
+                                        score[index // 2],
+                                        self.branching_factor)
+        child = which_child[0]
+        cur_range[index] = which_child[1][0]
+        cur_range[index + 1] = which_child[1][1]
+        assert 0 <= child < self.branching_factor
+        nodes.append((node, child))
+        node = self.__ChildNodeId(node, child)
+    return nodes
+
+  def __WhichChild(self, low, high, want, branching_factor):
+    """Determines which child of the range [low, high) 'want' belongs to.
+
+    Args:
+      low: An int, the low end of the range.
+      high: An int, the high end of the range.
+      want: An int, the score we're trying to determine a child for.
+      branching_factor: The branching factor of the tree being used.
+
+    Returns:
+      A tuple, (child, [child's score range]).  Note that in general a score
+      has multiple sub-scores, written in order of decreasing significance; this
+      function divides up a single sub-score.
+
+    Raises:
+      An AssertionError if things go horribly wrong.
+    """
+    assert low <= want < high
+    # Need to find x such that (using integer division):
+    #     x  *(high-low)/branching_factor <= want - low <
+    #   (x+1)*(high-low)/branching_factor
+    # Which is the least x such that (using integer division):
+    #   want - low < (x+1)*(high-low)/branching_factor
+    # Which is the ceiling of x such that (using floating point division):
+    #   want - low + 1 == (x+1)*(high-low)/branching_factor
+    #   x = -1 + math.ceil((want-low+1) * branching_factor / (high - low))
+    # We get ceil by adding high - low - 1 to the numerator.
+    x = -1 + (((want - low + 1) * branching_factor + high - low - 1) //
+              (high - low))
+    assert (x * (high - low) // branching_factor <=
+            want - low < (x + 1) * (high - low) // branching_factor)
+    return (x, self.__ChildScoreRange([low, high], x, branching_factor))
+
+  def __ChildScoreRange(self, score_range, child, branching_factor):
+    """Calculates the score_range for a node's child.
+
+    Args:
+      score_range: A score range [min0, max0, min1, max1, ...]
+      child: Which child of the node with score range score_range we're
+        calculating the score range of.
+      branching_factor: The branching factor of the tree in question.
+
+    Returns:
+      A score range [min0', max0', min1', max1', ...] for that child.
+    """
+    for i in xrange(1, len(score_range), 2):
+      if score_range[i] > score_range[i - 1] + 1:
+        child_score_range = list(score_range)
+        low, high = score_range[i - 1], score_range[i]
+        child_score_range[i - 1], child_score_range[i] = (
+            low + child * (high - low) // branching_factor,
+            low + (child + 1) * (high - low) // branching_factor)
+        return child_score_range
+    raise AssertionError("Node with score range %s has no children." %
+                         score_range)
+
+  def __ChildNodeId(self, node_id, child):
+    """Calculates the node id for a known node id's child.
+
+    Args:
+      node_id: The parent node's node_id
+      child: Which child of the parent node we're finding the id for
+
+    Returns:
+      The node_id for the child'th child of node_id.
+    """
+    return node_id * self.branching_factor + 1 + child
+
+  def __GetMultipleNodes(self, node_ids):
+    """Gets multiple nodes from the datastore.
+
+    Args:
+      node_ids: A list of node ids we want to get.
+
+    Returns:
+      A dict of the nodes that were found, indexed by the node ids found
+      in node_ids.
+    """
+    if len(node_ids) == 0:
+      return []
+    node_ids = set(node_ids)
+    keys = [self.__KeyFromNodeId(node_id) for node_id in node_ids]
+    nodes = datastore.Get(keys)
+    return dict((node_id, node) for (node_id, node) in zip(node_ids, nodes)
+                if node)
+
+  # Although, this method is currently not needed, we'll keep this
+  # since we might need it and some point and it's an interesting
+  # relationship
+  def __ParentNode(self, node_id):
+    """Returns the node id of the parameter node id's parent.  Returns None if
+    the parameter is 0."""
+    if node_id == 0:
+      return None
+    return (node_id - 1) // self.branching_factor
+
+  def __KeyFromNodeId(self, node_id):
+    """Creates a (named) key for the node with a given id.
+
+    The key will have the ranker as a parent element to guarantee
+    uniqueness (in the presence of multiple rankers) and to put all
+    nodes in a single entity group.
+
+    Args:
+      node_id: The node's id as an integer.
+
+    Returns:
+      A (named) key for the node with the id 'node_id'.
+    """
+    name = "node_%x" % node_id
+    return datastore_types.Key.from_path("ranker_node", name,
+                                         parent=self.rootkey)
+
+  def __KeyForScore(self, name):
+    """Returns a (named) key for a ranker_score entity.
+
+    Args:
+      name: Name of the score to create a key for.
+
+    Returns:
+      A (named) key for the entity storing the score of 'name'.
+    """
+    return datastore_types.Key.from_path("ranker_score", name,
+                                         parent=self.rootkey)
+
+  def __Increment(self, nodes_with_children, score_entities,
+                  score_entities_to_delete):
+    """Changes child counts for given nodes.
+
+    This method will create nodes as needed.
+
+    Args:
+      nodes_with_children: A dict of (node_key, child) tuples to deltas
+      score_entities: Additional score entities to persist as part of
+        this transaction
+    Returns:
+      None
+    """
+    keys = list(set(key for ((key, _), delta) in nodes_with_children.iteritems()
+                    if delta != 0))
+    if not keys:
+      return  # Nothing to do
+    nodes = datastore.Get(keys)
+
+    node_dict = {}
+    for (key, node) in zip(keys, nodes):
+      if not node:
+        node = datastore.Entity("ranker_node", parent=self.rootkey,
+                                name=key.name())
+        node["child_counts"] = [0] * self.branching_factor
+      node_dict[key] = node
+    for ((key, child), amount) in nodes_with_children.iteritems():
+      if amount != 0:
+        node = node_dict[key]
+        node["child_counts"][child] += amount
+        assert node["child_counts"][child] >= 0
+    datastore.Put(node_dict.values() + score_entities)
+    if score_entities_to_delete:
+      datastore.Delete(score_entities_to_delete)
+
+  def SetScore(self, name, score):
+    """Sets a single score.
+
+    This is equivalent to calling 'SetScores({name: score})'
+
+    Args:
+      name: the name of the score as a string
+      score: the score to set name to
+    """
+    return self.SetScores({name: score})
+
+  @transactional
+  def SetScores(self, scores):
+    """Changes multiple scores atomically.
+
+    Sets the scores of the named entities in scores to new values. For
+    named entities that have not been registered with a score before,
+    a new score is created. For named entities that already had a score,
+    the score is changed to reflect the new score. If a score is None,
+    the named entity's score will be removed from the ranker.
+
+    Args:
+      scores: A dict mapping entity names (strings) to scores (integer lists)
+    """
+    score_deltas, score_ents, score_ents_del = self.__ComputeScoreDeltas(scores)
+    node_ids_to_deltas = self.__ComputeNodeModifications(score_deltas)
+    self.__Increment(node_ids_to_deltas, score_ents, score_ents_del)
+
+  def __ComputeScoreDeltas(self, scores):
+    """Compute which scores have to be incremented and decremented.
+
+    Args:
+      scores: A dict mapping entity names to scores
+
+    Returns:
+      A tuple (score_deltas, score_entities, score_entities_to_delete).
+
+      'score_deltas' is a dict, mapping scores (represented as tuples)
+      to integers. 'score_deltas[s]' represents how many times the
+      score 's' has to be incremented (or decremented).
+
+      'score_entities' is a list of 'ranker_score' entities that have
+      to be updated in the same transaction as modifying the ranker
+      nodes. The entities already contain the updated score.
+
+      Similarly, 'score_entities_to_delete' is a list of entities that
+      have to be deleted in the same transaction as modifying the ranker
+      nodes.
+    """
+    score_keys = [self.__KeyForScore(score) for score in scores]
+    old_scores = {}
+    for old_score in datastore.Get(score_keys):
+      if old_score:
+        old_scores[old_score.key().name()] = old_score
+    score_deltas = {}
+    # Score entities to update
+    score_ents = []
+    score_ents_del = []
+    for score_name, score_value in scores.iteritems():
+      if score_name in old_scores:
+        score_ent = old_scores[score_name]
+        if score_ent["value"] == score_value:
+          continue  # No change in score => nothing to do
+        old_score_key = tuple(score_ent["value"])
+        score_deltas.setdefault(old_score_key, 0)
+        score_deltas[old_score_key] -= 1
+      else:
+        score_ent = datastore.Entity("ranker_score", parent=self.rootkey,
+                                     name=score_name)
+      if score_value:
+        score_key = tuple(score_value)
+        score_deltas.setdefault(score_key, 0)
+        score_deltas[score_key] += 1
+        score_ent["value"] = score_value
+        score_ents.append(score_ent)
+      else:
+        # Do we have to delete an old score entity?
+        if score_name in old_scores:
+          score_ents_del.append(old_scores[score_name])
+
+    return (score_deltas, score_ents, score_ents_del)
+
+  def __ComputeNodeModifications(self, score_deltas):
+    """Computes modifications to ranker nodes.
+
+    Given score deltas, computes which nodes need to be modified and by
+    how much their child count has to be incremented / decremented.
+
+    Args:
+      score_deltas: A dict of scores to integers, as returned by
+        _ComputeScoreDeltas.
+
+    Returns:
+      A dict of nodes (represented as node_key, child tuples) to integers.
+      'result[(node_key, i)]' represents the amount that needs to be added to
+      the i-th child of node node_key.
+    """
+    nodes_to_deltas = {}
+    for score, delta in score_deltas.iteritems():
+      for (node_id, child) in self.__FindNodeIDs(score):
+        node = (self.__KeyFromNodeId(node_id), child)
+        nodes_to_deltas[node] = nodes_to_deltas.get(node, 0) + delta
+    return nodes_to_deltas
+
+  def __FindRank(self, node_ids_with_children, nodes):
+    """Utility function.  Finds the rank of a score.
+
+    Args:
+      node_ids_with_children: A list of node ids down to that score,
+        paired with which child links to follow.
+      nodes: A dict mapping node id to node entity.
+
+    Returns:
+      The score's rank.
+    """
+    tot = 0  # Counts the number of higher scores.
+    for (node_id, child) in node_ids_with_children:
+      if node_id in nodes:
+        node = nodes[node_id]
+        for i in xrange(child + 1, self.branching_factor):
+          tot += node["child_counts"][i]
+      else:
+        # If the node isn't in the dict, the node simply doesn't exist.  We
+        # are probably finding the rank for a score that doesn't appear in the
+        # ranker, but that's perfectly fine.
+        pass
+    return tot
+
+  def FindRank(self, score):
+    """Finds the 0-based rank of a particular score; more precisely, returns the
+    number of strictly higher scores stored.
+
+    Args:
+      score: The score whose rank we wish to find.
+
+    Returns:
+      The number of tracked scores that are higher.  Does not check whether
+      anyone actually has the requested score.
+    """
+    return self.FindRanks([score])[0]
+
+  def FindRanks(self, scores):
+    """Finds the 0-based ranks of a number of particular scores.
+    Like FindRank, but more efficient for multiple scores.
+
+    Args:
+      scores: A list of scores.
+
+    Returns:
+      A list of ranks.
+    """
+    for score in scores:
+      assert len(score) * 2 == len(self.score_range)
+    # Find the nodes we'll need to query to find information about these scores:
+    node_ids_with_children_list = [self.__FindNodeIDs(score)
+                                   for score in scores]
+    node_ids = []
+    for node_ids_with_children in node_ids_with_children_list:
+      node_ids += [node_id for (node_id, _) in node_ids_with_children]
+    # Query the needed nodes:
+    nodes_dict = self.__GetMultipleNodes(node_ids)
+    # Call __FindRank, which does the math, for each score:
+    return [self.__FindRank(node_ids_with_children, nodes_dict) for
+            node_ids_with_children in node_ids_with_children_list]
+
+  def __FindScore(self, node_id, rank, score_range, approximate):
+    """To be run in a transaction.  Finds the score ranked 'rank' in the subtree
+    defined by node 'nodekey.'
+
+    Args:
+      node_id: The id of the node whose subtree we wish to find the
+        score of rank 'rank' in.
+      rank: The rank (within this subtree) of the score we wish to find.
+      score_range: The score range for this particular node, as a list.
+        Derivable from the node's node_id, but included for convenience.
+      approximate: Do we have to return an approximate result, or an exact one?
+        See the docstrings for FindScore and FindScoreApproximate.
+
+    Returns:
+      A tuple, (score, rank_of_tie), indicating the score's rank within
+      node_id's subtree.  The way it indicates rank is defined in the dosctrings
+      of FindScore and FindScoreApproximate, depending on the value of
+      'approximate'.
+    """
+    # If we're approximating and thus allowed to do so, early-out if we just
+    # need to return the highest available score.
+    if approximate and rank == 0:
+      return ([score - 1 for score in score_range[1::2]], 0)
+    # Find the current node.
+    node = datastore.Get(self.__KeyFromNodeId(node_id))
+    child_counts = node["child_counts"]
+    initial_rank = rank
+    for i in xrange(self.branching_factor - 1, -1, -1):
+      # If this child has enough scores that rank 'rank' is in
+      # there, recurse.
+      if rank - child_counts[i] < 0:
+        child_score_range = self.__ChildScoreRange(score_range, i,
+                                                   self.branching_factor)
+        if self.__IsSingletonRange(child_score_range):
+          # Base case; child_score_range refers to a single score. We don't
+          # store leaf nodes so we can return right here.
+          return (child_score_range[0::2], initial_rank - rank)
+        # Not a base case.  Keep descending into children.
+        ans = self.__FindScore(self.__ChildNodeId(node_id, i), rank,
+                               child_score_range,
+                               approximate)
+        # Note the 'initial_rank - rank': we've asked the child for a score of
+        # some rank among *its* children, so we have to add back in the scores
+        # discarded on the way to that child.
+        return (ans[0], ans[1] + (initial_rank - rank))
+      else:
+        rank -= child_counts[i]
+    return None
+
+  def __IsSingletonRange(self, scorerange):
+    """Returns whether a range contains exactly one score."""
+    return [score + 1 for score in scorerange[0::2]] == scorerange[1::2]
+
+  @transactional
+  def FindScore(self, rank):
+    """Finds the score ranked at 'rank'.
+
+    Args:
+      rank: The rank of the score we wish to find.
+
+    Returns:
+      A tuple, (score, rank_of_tie).  'score' is the score ranked at 'rank',
+      'rank_of_tie' is the rank of that score (which may be different from
+      'rank' in the case of ties).
+      e.g. if there are two scores tied at 5th and rank == 6, returns
+      (score, 5).
+    """
+    return self.__FindScore(0, rank, self.score_range, False)
+
+  @transactional
+  def FindScoreApproximate(self, rank):
+    """Finds a score that >= the score ranked at 'rank'.
+
+    This method could be preferred to FindScore because it is more efficient.
+    For example, if the objective is to find the top 50 scores of rank X or
+    less, and those scores are stored in entities called scoreboard_row:
+      score, rank = myrank.FindScoreApproximate(X)
+      query = datastore.Query('scoreboard_row')
+      query['score <='] = score
+      result = query.Get(50 + X - rank)[X-rank:])  # Takes care of ties.
+
+    Args:
+      rank: The rank of the score we wish to find.
+
+    Returns:
+      A tuple, (score, rank_of_tie).
+      If there is a tie at rank 'rank-1':
+        rank's score <= score < rank-1's score, rank_of_tie == rank
+      else:
+        score == rank's score, rank_of_tie == the tied rank of everyone
+        in the tie.
+        e.g. if two scores are tied at 5th and rank == 6, returns (score, 5).
+    """
+    return self.__FindScore(0, rank, self.score_range, True)
+
+  def TotalRankedScores(self):
+    """Returns the total number of ranked scores.
+
+    Returns:
+      The total number of ranked scores.
+    """
+    root = datastore.Get([self.__KeyFromNodeId(0)])[0]
+    if root:
+      return sum(root["child_counts"])
+    else:
+      # Ranker doesn't have any ranked scores, yet
+      return 0