py_tasks_melange: comparison app/ranklist/ranker.py

equal deleted inserted replaced

-:834ead6528a8
+:e7b7ea113a3b
+#!/usr/bin/python
+#
+# Copyright 2008 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from google.appengine.api import datastore
+from google.appengine.api import datastore_types
+from common import transactional
+class Ranker(object):
+"""A data structure for storing integer scores and quickly retrieving their
+relative ranks.
+Scores are stored as "name: score" mapping. A score is inserted by
+calling SetScore with a new name. The score can later be updated by
+calling SetScore again with the same name.
+The scores are actually lists of integers with the same number of elements,
+and their ordering is lexicographic.  That is to say that score A is higher
+than score B if they are different, and the first element that differs between
+the two is higher in A.  Thus [5, 3] is ranked higher than [4, 99].
+Example Use Case:
+Some number of people are participating in a programming contest.  Solving a
+problem gets points; contestants also get a tie-breaking "penalty time."  The
+higher the time, the worse the score.
+STEP 1:
+# Creates the ranker when the contest is created:
+rank = ranker.Ranker.Create([0, 10000, -999999, 1], 100)
+# In our contest, people won't have more than 9999 points or 999999 penalty
+# seconds.  Since penalty time is bad, we store [score, -penalty].
+contest['ranker'] = rank.rootkey
+datastore.Put(contest)
+STEP 2:
+# Someone registers for the contest.  The default score is [0, 0], and for
+# efficiency we don't put them in the ranker; they won't be ahead of anyone
+# anyway.
+STEP 3:
+# Player "Jon" gets points for the first time!
+rank = ranker.Ranker(contest['ranker'])
+# Loads up the ranker.  This is the first step of all the STEPs below.
+rank.SetScore("Jon", [5, -120])  # 5 points, 2 minutes
+STEP 4:
+# Player "Jon" gets points for the second time!
+rank.SetScore("Jon", [10, -300])  # 10 points, 5 minutes
+STEP 5:
+# What is the rank of the person with score [10, -300]?
+position = rank.FindRank([10, 300])
+# What are the ranks for the people with scores a, b, c?
+positions = rank.FindRanks([a, b, c])
+STEP 6:
+# What is the score of the person ranked 20th?
+score = rank.FindScore(19)
+# This is particularly useful for seeing multiple pages of ranked people.  If
+# the scores are separately tracked in an entity called 'scores', the
+# datastore can efficiently answer:
+# q = datastore.Query('scores')
+# q['score <='] = rank.FindScore(1000)[0]
+# next_twenty_scores = q.Get(20)
+# This is a simplified case, where scores are a single integer.
+STEP 7:
+# How many people have points?
+num_pointy_people = rank.TotalRankedScores()
+Implementation Details:
+The Ranker is a rooted tree of 'ranker_node' entities.  It is an
+N-ary tree, where N = self.branching_factor, which is specified by the
+constructor.  Each node represents some range of scores, and is assigned a
+unique node_id.
+Take for example a 3-ary tree with point range [0, 10000, -36000, 1].  This
+could represent a contest where contestants can have between 0 and 9999
+points, with ties being broken by (negative) penalty seconds that can be
+between 0 and 10 hours.
+The root represents the score range [0, 10000, -36000, 1].  It has node_id 0.
+Its first child has node_id 1 and score range [0, 3333, -36000, 1].
+The root's second child, node_id 2, has score range [3333, 6666, -36000, 1],
+and its third child, node_id 3, has score range [6666, 10000, -36000, 1].
+Node 1's first child, node_id 4, has range [0, 1111, -36000, 1], and so on.
+See __WhichChild for details of how children are assigned score ranges.
+The point of a node is to track how many stored scores are in the score range
+of each of its children.  The root in the above example would start off with
+child_node_counts = [0, 0, 0]; adding score [4000, 0] would change the root's
+child_node_counts to [0, 1, 0] and node 5's child_node_counts to [1, 0, 0],
+and so forth.
+Ranker also has a "ranker_score" entity for every score stored in the ranker.
+These entities are part of the same entity group as the ranker_node
+entities. This allows for atomic, idempotent calls to SetScores.
+Ranker supports the following operations, which can be read about in detail
+in their docstrings:
+SetScores(scores): Set scores for multiple players.
+FindRank(score): Finds the 0-based rank of the provided score.
+FindScore(rank): Finds the score with the provided 0-based rank.
+FindScoreApproximate(rank): Finds a score >= the score of the provided 0-based
+rank, < the score of rank-1 (unless rank and rank-1 are tied, in which case
+it returns their mutual score).
+TotalRankedScores: The total number of scores in the Ranker.
+See __FindNodeIDs for more notes on structure.
+"""
+def __init__(self, rootkey):
+"""Pulls a ranker out of the datastore, given the key of the root node.
+Args:
+rootkey: The datastore key of the ranker.
+"""
+# Get the root from the datastore:
+assert rootkey.kind() == "ranker"
+root = datastore.Get(rootkey)
+# Initialize some class variables:
+self.rootkey = rootkey
+self.score_range = root["score_range"]
+self.branching_factor = root["branching_factor"]
+# Sanity checking:
+assert len(self.score_range) > 1
+assert len(self.score_range) % 2 == 0
+for i in xrange(0, len(self.score_range), 2):
+assert self.score_range[i + 1] > self.score_range[i]
+assert self.branching_factor > 1
+@classmethod
+def Create(cls, score_range, branching_factor):
+"""Constructs a new Ranker and returns it.
+Args:
+score_range: A list showing the range of valid scores, in the form:
+[most_significant_score_min, most_significant_score_max,
+less_significant_score_min, less_significant_score_max, ...]
+Ranges are [inclusive, exclusive)
+branching_factor: The branching factor of the tree.  The number of
+datastore Gets is Theta(1/log(branching_factor)), and the amount of data
+returned by each Get is Theta(branching_factor).
+Returns:
+A new Ranker.
+"""
+# Put the root in the datastore:
+root = datastore.Entity("ranker")
+root["score_range"] = score_range
+root["branching_factor"] = branching_factor
+datastore.Put(root)
+myrank = Ranker(root.key())
+return myrank
+def __FindNodeIDs(self, score):
+"""Finds the nodes along the path from the root to a certain score.
+Args:
+score: The score we're finding the path for.
+Returns:
+A sorted list of (node_id, child) tuples, indicating that node_id is the
+node id of a node on the path, and child is which child of that node is
+next.  Note that the lowest child node (which would be a leaf node) does
+not actually exist, since all its relevant information (number of times
+that score was inserted) is stored in its parent.
+Nodes are numbered row-by-row: the root is 0, its children are in the range
+[1, self.branching_factor + 1), its grandchildren are in the range
+[self.branching_factor + 1,
+self.branching_factor**2 + self.branching_factor + 1), etc.
+Score ranges are lists of the form: [min_0, max_0, min_1, max_1, ...]
+A node representing a score range will be divided up by the first index
+where max_i != min_i + 1 (score ranges are [inclusive, exclusive)).
+Child x (0-indexed) of a node [a,b) will get the range:
+[a+x*(b-a)/branching_factor, a+(x+1)*(b-a)/branching_factor);
+Thus not all nodes will have nonzero ranges.  Nodes with zero range will
+never be visited, but they and their descendants will be counted in the node
+numbering scheme, so row x still has self.branching_factor**x nodes.
+"""
+nodes = []
+node = 0
+cur_range = list(self.score_range)
+# The current range of scores.  This will be narrowed as we move down the
+# tree; 'index' keeps track of the score type we're currently changing.
+for index in xrange(0, len(cur_range), 2):
+while cur_range[index + 1] - cur_range[index] > 1:
+# Subdivide cur_range[index]..cur_range[index + 1]
+which_child = self.__WhichChild(cur_range[index],
+cur_range[index + 1],
+score[index // 2],
+self.branching_factor)
+child = which_child[0]
+cur_range[index] = which_child[1][0]
+cur_range[index + 1] = which_child[1][1]
+assert 0 <= child < self.branching_factor
+nodes.append((node, child))
+node = self.__ChildNodeId(node, child)
+return nodes
+def __WhichChild(self, low, high, want, branching_factor):
+"""Determines which child of the range [low, high) 'want' belongs to.
+Args:
+low: An int, the low end of the range.
+high: An int, the high end of the range.
+want: An int, the score we're trying to determine a child for.
+branching_factor: The branching factor of the tree being used.
+Returns:
+A tuple, (child, [child's score range]).  Note that in general a score
+has multiple sub-scores, written in order of decreasing significance; this
+function divides up a single sub-score.
+Raises:
+An AssertionError if things go horribly wrong.
+"""
+assert low <= want < high
+# Need to find x such that (using integer division):
+#     x  *(high-low)/branching_factor <= want - low <
+#   (x+1)*(high-low)/branching_factor
+# Which is the least x such that (using integer division):
+#   want - low < (x+1)*(high-low)/branching_factor
+# Which is the ceiling of x such that (using floating point division):
+#   want - low + 1 == (x+1)*(high-low)/branching_factor
+#   x = -1 + math.ceil((want-low+1) * branching_factor / (high - low))
+# We get ceil by adding high - low - 1 to the numerator.
+x = -1 + (((want - low + 1) * branching_factor + high - low - 1) //
+(high - low))
+assert (x * (high - low) // branching_factor <=
+want - low < (x + 1) * (high - low) // branching_factor)
+return (x, self.__ChildScoreRange([low, high], x, branching_factor))
+def __ChildScoreRange(self, score_range, child, branching_factor):
+"""Calculates the score_range for a node's child.
+Args:
+score_range: A score range [min0, max0, min1, max1, ...]
+child: Which child of the node with score range score_range we're
+calculating the score range of.
+branching_factor: The branching factor of the tree in question.
+Returns:
+A score range [min0', max0', min1', max1', ...] for that child.
+"""
+for i in xrange(1, len(score_range), 2):
+if score_range[i] > score_range[i - 1] + 1:
+child_score_range = list(score_range)
+low, high = score_range[i - 1], score_range[i]
+child_score_range[i - 1], child_score_range[i] = (
+low + child * (high - low) // branching_factor,
+low + (child + 1) * (high - low) // branching_factor)
+return child_score_range
+raise AssertionError("Node with score range %s has no children." %
+score_range)
+def __ChildNodeId(self, node_id, child):
+"""Calculates the node id for a known node id's child.
+Args:
+node_id: The parent node's node_id
+child: Which child of the parent node we're finding the id for
+Returns:
+The node_id for the child'th child of node_id.
+"""
+return node_id * self.branching_factor + 1 + child
+def __GetMultipleNodes(self, node_ids):
+"""Gets multiple nodes from the datastore.
+Args:
+node_ids: A list of node ids we want to get.
+Returns:
+A dict of the nodes that were found, indexed by the node ids found
+in node_ids.
+"""
+if len(node_ids) == 0:
+return []
+node_ids = set(node_ids)
+keys = [self.__KeyFromNodeId(node_id) for node_id in node_ids]
+nodes = datastore.Get(keys)
+return dict((node_id, node) for (node_id, node) in zip(node_ids, nodes)
+if node)
+# Although, this method is currently not needed, we'll keep this
+# since we might need it and some point and it's an interesting
+# relationship
+def __ParentNode(self, node_id):
+"""Returns the node id of the parameter node id's parent.  Returns None if
+the parameter is 0."""
+if node_id == 0:
+return None
+return (node_id - 1) // self.branching_factor
+def __KeyFromNodeId(self, node_id):
+"""Creates a (named) key for the node with a given id.
+The key will have the ranker as a parent element to guarantee
+uniqueness (in the presence of multiple rankers) and to put all
+nodes in a single entity group.
+Args:
+node_id: The node's id as an integer.
+Returns:
+A (named) key for the node with the id 'node_id'.
+"""
+name = "node_%x" % node_id
+return datastore_types.Key.from_path("ranker_node", name,
+parent=self.rootkey)
+def __KeyForScore(self, name):
+"""Returns a (named) key for a ranker_score entity.
+Args:
+name: Name of the score to create a key for.
+Returns:
+A (named) key for the entity storing the score of 'name'.
+"""
+return datastore_types.Key.from_path("ranker_score", name,
+parent=self.rootkey)
+def __Increment(self, nodes_with_children, score_entities,
+score_entities_to_delete):
+"""Changes child counts for given nodes.
+This method will create nodes as needed.
+Args:
+nodes_with_children: A dict of (node_key, child) tuples to deltas
+score_entities: Additional score entities to persist as part of
+this transaction
+Returns:
+None
+"""
+keys = list(set(key for ((key, _), delta) in nodes_with_children.iteritems()
+if delta != 0))
+if not keys:
+return  # Nothing to do
+nodes = datastore.Get(keys)
+node_dict = {}
+for (key, node) in zip(keys, nodes):
+if not node:
+node = datastore.Entity("ranker_node", parent=self.rootkey,
+name=key.name())
+node["child_counts"] = [0] * self.branching_factor
+node_dict[key] = node
+for ((key, child), amount) in nodes_with_children.iteritems():
+if amount != 0:
+node = node_dict[key]
+node["child_counts"][child] += amount
+assert node["child_counts"][child] >= 0
+datastore.Put(node_dict.values() + score_entities)
+if score_entities_to_delete:
+datastore.Delete(score_entities_to_delete)
+def SetScore(self, name, score):
+"""Sets a single score.
+This is equivalent to calling 'SetScores({name: score})'
+Args:
+name: the name of the score as a string
+score: the score to set name to
+"""
+return self.SetScores({name: score})
+@transactional
+def SetScores(self, scores):
+"""Changes multiple scores atomically.
+Sets the scores of the named entities in scores to new values. For
+named entities that have not been registered with a score before,
+a new score is created. For named entities that already had a score,
+the score is changed to reflect the new score. If a score is None,
+the named entity's score will be removed from the ranker.
+Args:
+scores: A dict mapping entity names (strings) to scores (integer lists)
+"""
+score_deltas, score_ents, score_ents_del = self.__ComputeScoreDeltas(scores)
+node_ids_to_deltas = self.__ComputeNodeModifications(score_deltas)
+self.__Increment(node_ids_to_deltas, score_ents, score_ents_del)
+def __ComputeScoreDeltas(self, scores):
+"""Compute which scores have to be incremented and decremented.
+Args:
+scores: A dict mapping entity names to scores
+Returns:
+A tuple (score_deltas, score_entities, score_entities_to_delete).
+'score_deltas' is a dict, mapping scores (represented as tuples)
+to integers. 'score_deltas[s]' represents how many times the
+score 's' has to be incremented (or decremented).
+'score_entities' is a list of 'ranker_score' entities that have
+to be updated in the same transaction as modifying the ranker
+nodes. The entities already contain the updated score.
+Similarly, 'score_entities_to_delete' is a list of entities that
+have to be deleted in the same transaction as modifying the ranker
+nodes.
+"""
+score_keys = [self.__KeyForScore(score) for score in scores]
+old_scores = {}
+for old_score in datastore.Get(score_keys):
+if old_score:
+old_scores[old_score.key().name()] = old_score
+score_deltas = {}
+# Score entities to update
+score_ents = []
+score_ents_del = []
+for score_name, score_value in scores.iteritems():
+if score_name in old_scores:
+score_ent = old_scores[score_name]
+if score_ent["value"] == score_value:
+continue  # No change in score => nothing to do
+old_score_key = tuple(score_ent["value"])
+score_deltas.setdefault(old_score_key, 0)
+score_deltas[old_score_key] -= 1
+else:
+score_ent = datastore.Entity("ranker_score", parent=self.rootkey,
+name=score_name)
+if score_value:
+score_key = tuple(score_value)
+score_deltas.setdefault(score_key, 0)
+score_deltas[score_key] += 1
+score_ent["value"] = score_value
+score_ents.append(score_ent)
+else:
+# Do we have to delete an old score entity?
+if score_name in old_scores:
+score_ents_del.append(old_scores[score_name])
+return (score_deltas, score_ents, score_ents_del)
+def __ComputeNodeModifications(self, score_deltas):
+"""Computes modifications to ranker nodes.
+Given score deltas, computes which nodes need to be modified and by
+how much their child count has to be incremented / decremented.
+Args:
+score_deltas: A dict of scores to integers, as returned by
+_ComputeScoreDeltas.
+Returns:
+A dict of nodes (represented as node_key, child tuples) to integers.
+'result[(node_key, i)]' represents the amount that needs to be added to
+the i-th child of node node_key.
+"""
+nodes_to_deltas = {}
+for score, delta in score_deltas.iteritems():
+for (node_id, child) in self.__FindNodeIDs(score):
+node = (self.__KeyFromNodeId(node_id), child)
+nodes_to_deltas[node] = nodes_to_deltas.get(node, 0) + delta
+return nodes_to_deltas
+def __FindRank(self, node_ids_with_children, nodes):
+"""Utility function.  Finds the rank of a score.
+Args:
+node_ids_with_children: A list of node ids down to that score,
+paired with which child links to follow.
+nodes: A dict mapping node id to node entity.
+Returns:
+The score's rank.
+"""
+tot = 0  # Counts the number of higher scores.
+for (node_id, child) in node_ids_with_children:
+if node_id in nodes:
+node = nodes[node_id]
+for i in xrange(child + 1, self.branching_factor):
+tot += node["child_counts"][i]
+else:
+# If the node isn't in the dict, the node simply doesn't exist.  We
+# are probably finding the rank for a score that doesn't appear in the
+# ranker, but that's perfectly fine.
+pass
+return tot
+def FindRank(self, score):
+"""Finds the 0-based rank of a particular score; more precisely, returns the
+number of strictly higher scores stored.
+Args:
+score: The score whose rank we wish to find.
+Returns:
+The number of tracked scores that are higher.  Does not check whether
+anyone actually has the requested score.
+"""
+return self.FindRanks([score])[0]
+def FindRanks(self, scores):
+"""Finds the 0-based ranks of a number of particular scores.
+Like FindRank, but more efficient for multiple scores.
+Args:
+scores: A list of scores.
+Returns:
+A list of ranks.
+"""
+for score in scores:
+assert len(score) * 2 == len(self.score_range)
+# Find the nodes we'll need to query to find information about these scores:
+node_ids_with_children_list = [self.__FindNodeIDs(score)
+for score in scores]
+node_ids = []
+for node_ids_with_children in node_ids_with_children_list:
+node_ids += [node_id for (node_id, _) in node_ids_with_children]
+# Query the needed nodes:
+nodes_dict = self.__GetMultipleNodes(node_ids)
+# Call __FindRank, which does the math, for each score:
+return [self.__FindRank(node_ids_with_children, nodes_dict) for
+node_ids_with_children in node_ids_with_children_list]
+def __FindScore(self, node_id, rank, score_range, approximate):
+"""To be run in a transaction.  Finds the score ranked 'rank' in the subtree
+defined by node 'nodekey.'
+Args:
+node_id: The id of the node whose subtree we wish to find the
+score of rank 'rank' in.
+rank: The rank (within this subtree) of the score we wish to find.
+score_range: The score range for this particular node, as a list.
+Derivable from the node's node_id, but included for convenience.
+approximate: Do we have to return an approximate result, or an exact one?
+See the docstrings for FindScore and FindScoreApproximate.
+Returns:
+A tuple, (score, rank_of_tie), indicating the score's rank within
+node_id's subtree.  The way it indicates rank is defined in the dosctrings
+of FindScore and FindScoreApproximate, depending on the value of
+'approximate'.
+"""
+# If we're approximating and thus allowed to do so, early-out if we just
+# need to return the highest available score.
+if approximate and rank == 0:
+return ([score - 1 for score in score_range[1::2]], 0)
+# Find the current node.
+node = datastore.Get(self.__KeyFromNodeId(node_id))
+child_counts = node["child_counts"]
+initial_rank = rank
+for i in xrange(self.branching_factor - 1, -1, -1):
+# If this child has enough scores that rank 'rank' is in
+# there, recurse.
+if rank - child_counts[i] < 0:
+child_score_range = self.__ChildScoreRange(score_range, i,
+self.branching_factor)
+if self.__IsSingletonRange(child_score_range):
+# Base case; child_score_range refers to a single score. We don't
+# store leaf nodes so we can return right here.
+return (child_score_range[0::2], initial_rank - rank)
+# Not a base case.  Keep descending into children.
+ans = self.__FindScore(self.__ChildNodeId(node_id, i), rank,
+child_score_range,
+approximate)
+# Note the 'initial_rank - rank': we've asked the child for a score of
+# some rank among *its* children, so we have to add back in the scores
+# discarded on the way to that child.
+return (ans[0], ans[1] + (initial_rank - rank))
+else:
+rank -= child_counts[i]
+return None
+def __IsSingletonRange(self, scorerange):
+"""Returns whether a range contains exactly one score."""
+return [score + 1 for score in scorerange[0::2]] == scorerange[1::2]
+@transactional
+def FindScore(self, rank):
+"""Finds the score ranked at 'rank'.
+Args:
+rank: The rank of the score we wish to find.
+Returns:
+A tuple, (score, rank_of_tie).  'score' is the score ranked at 'rank',
+'rank_of_tie' is the rank of that score (which may be different from
+'rank' in the case of ties).
+e.g. if there are two scores tied at 5th and rank == 6, returns
+(score, 5).
+"""
+return self.__FindScore(0, rank, self.score_range, False)
+@transactional
+def FindScoreApproximate(self, rank):
+"""Finds a score that >= the score ranked at 'rank'.
+This method could be preferred to FindScore because it is more efficient.
+For example, if the objective is to find the top 50 scores of rank X or
+less, and those scores are stored in entities called scoreboard_row:
+score, rank = myrank.FindScoreApproximate(X)
+query = datastore.Query('scoreboard_row')
+query['score <='] = score
+result = query.Get(50 + X - rank)[X-rank:])  # Takes care of ties.
+Args:
+rank: The rank of the score we wish to find.
+Returns:
+A tuple, (score, rank_of_tie).
+If there is a tie at rank 'rank-1':
+rank's score <= score < rank-1's score, rank_of_tie == rank
+else:
+score == rank's score, rank_of_tie == the tied rank of everyone
+in the tie.
+e.g. if two scores are tied at 5th and rank == 6, returns (score, 5).
+"""
+return self.__FindScore(0, rank, self.score_range, True)
+def TotalRankedScores(self):
+"""Returns the total number of ranked scores.
+Returns:
+The total number of ranked scores.
+"""
+root = datastore.Get([self.__KeyFromNodeId(0)])[0]
+if root:
+return sum(root["child_counts"])
+else:
+# Ranker doesn't have any ranked scores, yet
+return 0