thirdparty/google_appengine/google/appengine/tools/bulkloader.py
changeset 2864 2e0b0af889be
parent 2413 d0b7dac5325c
child 3031 7678f72140e6
2862:27971a13089f 2864:2e0b0af889be
    12 # distributed under the License is distributed on an "AS IS" BASIS,
    12 # distributed under the License is distributed on an "AS IS" BASIS,
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14 # See the License for the specific language governing permissions and
    14 # See the License for the specific language governing permissions and
    15 # limitations under the License.
    15 # limitations under the License.
    16 #
    16 #
    17 
       
    18 """Imports data over HTTP.
    17 """Imports data over HTTP.
    19 
    18 
    20 Usage:
    19 Usage:
    21   %(arg0)s [flags]
    20   %(arg0)s [flags]
    22 
    21 
    31                             restricted to this rate. (Default 250000)
    30                             restricted to this rate. (Default 250000)
    32     --batch_size=<int>      Number of Entity objects to include in each post to
    31     --batch_size=<int>      Number of Entity objects to include in each post to
    33                             the URL endpoint. The more data per row/Entity, the
    32                             the URL endpoint. The more data per row/Entity, the
    34                             smaller the batch size should be. (Default 10)
    33                             smaller the batch size should be. (Default 10)
    35     --config_file=<path>    File containing Model and Loader definitions.
    34     --config_file=<path>    File containing Model and Loader definitions.
    36                             (Required)
    35                             (Required unless --dump or --restore are used)
    37     --db_filename=<path>    Specific progress database to write to, or to
    36     --db_filename=<path>    Specific progress database to write to, or to
    38                             resume from. If not supplied, then a new database
    37                             resume from. If not supplied, then a new database
    39                             will be started, named:
    38                             will be started, named:
    40                             bulkloader-progress-TIMESTAMP.
    39                             bulkloader-progress-TIMESTAMP.
    41                             The special filename "skip" may be used to simply
    40                             The special filename "skip" may be used to simply
    42                             skip reading/writing any progress information.
    41                             skip reading/writing any progress information.
    43     --download              Export entities to a file.
    42     --download              Export entities to a file.
       
    43     --dry_run               Do not execute any remote_api calls.
       
    44     --dump                  Use zero-configuration dump format.
    44     --email=<string>        The username to use. Will prompt if omitted.
    45     --email=<string>        The username to use. Will prompt if omitted.
    45     --exporter_opts=<string>
    46     --exporter_opts=<string>
    46                             A string to pass to the Exporter.initialize method.
    47                             A string to pass to the Exporter.initialize method.
    47     --filename=<path>       Path to the file to import. (Required)
    48     --filename=<path>       Path to the file to import. (Required)
    48     --has_header            Skip the first row of the input.
    49     --has_header            Skip the first row of the input.
    52                             datastore. (Required)
    53                             datastore. (Required)
    53     --loader_opts=<string>  A string to pass to the Loader.initialize method.
    54     --loader_opts=<string>  A string to pass to the Loader.initialize method.
    54     --log_file=<path>       File to write bulkloader logs.  If not supplied
    55     --log_file=<path>       File to write bulkloader logs.  If not supplied
    55                             then a new log file will be created, named:
    56                             then a new log file will be created, named:
    56                             bulkloader-log-TIMESTAMP.
    57                             bulkloader-log-TIMESTAMP.
       
    58     --map                   Map an action across datastore entities.
       
    59     --mapper_opts=<string>  A string to pass to the Mapper.initialize method.
    57     --num_threads=<int>     Number of threads to use for uploading entities
    60     --num_threads=<int>     Number of threads to use for uploading entities
    58                             (Default 10)
    61                             (Default 10)
    59     --passin                Read the login password from stdin.
    62     --passin                Read the login password from stdin.
       
    63     --restore               Restore from zero-configuration dump format.
    60     --result_db_filename=<path>
    64     --result_db_filename=<path>
    61                             Result database to write to for downloads.
    65                             Result database to write to for downloads.
    62     --rps_limit=<int>       The maximum number of records per second to
    66     --rps_limit=<int>       The maximum number of records per second to
    63                             transfer to the server. (Default: 20)
    67                             transfer to the server. (Default: 20)
    64     --url=<string>          URL endpoint to post to for importing data.
    68     --url=<string>          URL endpoint to post to for importing data.
    76 
    80 
    77 """
    81 """
    78 
    82 
    79 
    83 
    80 
    84 
    81 import cPickle
       
    82 import csv
    85 import csv
    83 import errno
    86 import errno
    84 import getopt
    87 import getopt
    85 import getpass
    88 import getpass
    86 import imp
    89 import imp
    87 import logging
    90 import logging
    88 import os
    91 import os
    89 import Queue
    92 import Queue
    90 import re
    93 import re
       
    94 import shutil
    91 import signal
    95 import signal
    92 import StringIO
    96 import StringIO
    93 import sys
    97 import sys
    94 import threading
    98 import threading
    95 import time
    99 import time
       
   100 import traceback
    96 import urllib2
   101 import urllib2
    97 import urlparse
   102 import urlparse
    98 
   103 
       
   104 from google.appengine.datastore import entity_pb
       
   105 
       
   106 from google.appengine.api import apiproxy_stub_map
       
   107 from google.appengine.api import datastore
    99 from google.appengine.api import datastore_errors
   108 from google.appengine.api import datastore_errors
       
   109 from google.appengine.datastore import datastore_pb
   100 from google.appengine.ext import db
   110 from google.appengine.ext import db
       
   111 from google.appengine.ext import key_range as key_range_module
   101 from google.appengine.ext.db import polymodel
   112 from google.appengine.ext.db import polymodel
   102 from google.appengine.ext.remote_api import remote_api_stub
   113 from google.appengine.ext.remote_api import remote_api_stub
       
   114 from google.appengine.ext.remote_api import throttle as remote_api_throttle
   103 from google.appengine.runtime import apiproxy_errors
   115 from google.appengine.runtime import apiproxy_errors
       
   116 from google.appengine.tools import adaptive_thread_pool
   104 from google.appengine.tools import appengine_rpc
   117 from google.appengine.tools import appengine_rpc
       
   118 from google.appengine.tools.requeue import ReQueue
   105 
   119 
   106 try:
   120 try:
   107   import sqlite3
   121   import sqlite3
   108 except ImportError:
   122 except ImportError:
   109   pass
   123   pass
   110 
   124 
   111 logger = logging.getLogger('google.appengine.tools.bulkloader')
   125 logger = logging.getLogger('google.appengine.tools.bulkloader')
   112 
   126 
       
   127 KeyRange = key_range_module.KeyRange
       
   128 
   113 DEFAULT_THREAD_COUNT = 10
   129 DEFAULT_THREAD_COUNT = 10
   114 
   130 
   115 DEFAULT_BATCH_SIZE = 10
   131 DEFAULT_BATCH_SIZE = 10
       
   132 
       
   133 DEFAULT_DOWNLOAD_BATCH_SIZE = 100
   116 
   134 
   117 DEFAULT_QUEUE_SIZE = DEFAULT_THREAD_COUNT * 10
   135 DEFAULT_QUEUE_SIZE = DEFAULT_THREAD_COUNT * 10
   118 
   136 
   119 _THREAD_SHOULD_EXIT = '_THREAD_SHOULD_EXIT'
   137 _THREAD_SHOULD_EXIT = '_THREAD_SHOULD_EXIT'
   120 
   138 
   123 STATE_SENT = 2
   141 STATE_SENT = 2
   124 STATE_NOT_SENT = 3
   142 STATE_NOT_SENT = 3
   125 
   143 
   126 STATE_GETTING = 1
   144 STATE_GETTING = 1
   127 STATE_GOT = 2
   145 STATE_GOT = 2
   128 STATE_NOT_GOT = 3
   146 STATE_ERROR = 3
   129 
       
   130 MINIMUM_THROTTLE_SLEEP_DURATION = 0.001
       
   131 
   147 
   132 DATA_CONSUMED_TO_HERE = 'DATA_CONSUMED_TO_HERE'
   148 DATA_CONSUMED_TO_HERE = 'DATA_CONSUMED_TO_HERE'
   133 
   149 
   134 INITIAL_BACKOFF = 1.0
   150 INITIAL_BACKOFF = 1.0
   135 
   151 
   140 
   156 
   141 DEFAULT_RPS_LIMIT = 20
   157 DEFAULT_RPS_LIMIT = 20
   142 
   158 
   143 DEFAULT_REQUEST_LIMIT = 8
   159 DEFAULT_REQUEST_LIMIT = 8
   144 
   160 
   145 BANDWIDTH_UP = 'http-bandwidth-up'
   161 MAXIMUM_INCREASE_DURATION = 5.0
   146 BANDWIDTH_DOWN = 'http-bandwidth-down'
   162 MAXIMUM_HOLD_DURATION = 12.0
   147 REQUESTS = 'http-requests'
       
   148 HTTPS_BANDWIDTH_UP = 'https-bandwidth-up'
       
   149 HTTPS_BANDWIDTH_DOWN = 'https-bandwidth-down'
       
   150 HTTPS_REQUESTS = 'https-requests'
       
   151 RECORDS = 'records'
       
   152 
       
   153 MAXIMUM_INCREASE_DURATION = 8.0
       
   154 MAXIMUM_HOLD_DURATION = 10.0
       
   155 
   163 
   156 
   164 
   157 def ImportStateMessage(state):
   165 def ImportStateMessage(state):
   158   """Converts a numeric state identifier to a status message."""
   166   """Converts a numeric state identifier to a status message."""
   159   return ({
   167   return ({
   168   """Converts a numeric state identifier to a status message."""
   176   """Converts a numeric state identifier to a status message."""
   169   return ({
   177   return ({
   170       STATE_READ: 'Batch read from file.',
   178       STATE_READ: 'Batch read from file.',
   171       STATE_GETTING: 'Fetching batch from server',
   179       STATE_GETTING: 'Fetching batch from server',
   172       STATE_GOT: 'Batch successfully fetched.',
   180       STATE_GOT: 'Batch successfully fetched.',
   173       STATE_NOT_GOT: 'Error while fetching batch'
   181       STATE_ERROR: 'Error while fetching batch'
       
   182   }[state])
       
   183 
       
   184 
       
   185 def MapStateMessage(state):
       
   186   """Converts a numeric state identifier to a status message."""
       
   187   return ({
       
   188       STATE_READ: 'Batch read from file.',
       
   189       STATE_GETTING: 'Querying for batch from server',
       
   190       STATE_GOT: 'Batch successfully fetched.',
       
   191       STATE_ERROR: 'Error while fetching or mapping.'
   174   }[state])
   192   }[state])
   175 
   193 
   176 
   194 
   177 def ExportStateName(state):
   195 def ExportStateName(state):
   178   """Converts a numeric state identifier to a string."""
   196   """Converts a numeric state identifier to a string."""
   179   return ({
   197   return ({
   180       STATE_READ: 'READ',
   198       STATE_READ: 'READ',
   181       STATE_GETTING: 'GETTING',
   199       STATE_GETTING: 'GETTING',
   182       STATE_GOT: 'GOT',
   200       STATE_GOT: 'GOT',
   183       STATE_NOT_GOT: 'NOT_GOT'
   201       STATE_ERROR: 'NOT_GOT'
   184   }[state])
   202   }[state])
   185 
   203 
   186 
   204 
   187 def ImportStateName(state):
   205 def ImportStateName(state):
   188   """Converts a numeric state identifier to a string."""
   206   """Converts a numeric state identifier to a string."""
   189   return ({
   207   return ({
   190       STATE_READ: 'READ',
   208       STATE_READ: 'READ',
   191       STATE_GETTING: 'SENDING',
   209       STATE_GETTING: 'SENDING',
   192       STATE_GOT: 'SENT',
   210       STATE_GOT: 'SENT',
   193       STATE_NOT_GOT: 'NOT_SENT'
   211       STATE_NOT_SENT: 'NOT_SENT'
   194   }[state])
   212   }[state])
   195 
   213 
   196 
   214 
   197 class Error(Exception):
   215 class Error(Exception):
   198   """Base-class for exceptions in this module."""
   216   """Base-class for exceptions in this module."""
   232 
   250 
   233 class FileNotWritableError(Error):
   251 class FileNotWritableError(Error):
   234   """A filename passed in by the user refers to a non-writable output file."""
   252   """A filename passed in by the user refers to a non-writable output file."""
   235 
   253 
   236 
   254 
   237 class KeyRangeError(Error):
       
   238   """Error while trying to generate a KeyRange."""
       
   239 
       
   240 
       
   241 class BadStateError(Error):
   255 class BadStateError(Error):
   242   """A work item in an unexpected state was encountered."""
   256   """A work item in an unexpected state was encountered."""
   243 
   257 
   244 
   258 
       
   259 class KeyRangeError(Error):
       
   260   """An error during construction of a KeyRangeItem."""
       
   261 
       
   262 
       
   263 class FieldSizeLimitError(Error):
       
   264   """The csv module tried to read a field larger than the size limit."""
       
   265 
       
   266   def __init__(self, limit):
       
   267     self.message = """
       
   268 A field in your CSV input file has exceeded the current limit of %d.
       
   269 
       
   270 You can raise this limit by adding the following lines to your config file:
       
   271 
       
   272 import csv
       
   273 csv.field_size_limit(new_limit)
       
   274 
       
   275 where new_limit is a number larger than the size in bytes of the largest
       
   276 field in your CSV.
       
   277 """ % limit
       
   278     Error.__init__(self, self.message)
       
   279 
       
   280 
   245 class NameClashError(Error):
   281 class NameClashError(Error):
   246   """A name clash occurred while trying to alias old method names."""
   282   """A name clash occurred while trying to alias old method names."""
       
   283 
   247   def __init__(self, old_name, new_name, klass):
   284   def __init__(self, old_name, new_name, klass):
   248     Error.__init__(self, old_name, new_name, klass)
   285     Error.__init__(self, old_name, new_name, klass)
   249     self.old_name = old_name
   286     self.old_name = old_name
   250     self.new_name = new_name
   287     self.new_name = new_name
   251     self.klass = klass
   288     self.klass = klass
   252 
   289 
   253 
   290 
   254 def GetCSVGeneratorFactory(kind, csv_filename, batch_size, csv_has_header,
   291 def GetCSVGeneratorFactory(kind, csv_filename, batch_size, csv_has_header,
   255                            openfile=open, create_csv_reader=csv.reader):
   292                            openfile=open, create_csv_reader=csv.reader):
   256   """Return a factory that creates a CSV-based WorkItem generator.
   293   """Return a factory that creates a CSV-based UploadWorkItem generator.
   257 
   294 
   258   Args:
   295   Args:
   259     kind: The kind of the entities being uploaded.
   296     kind: The kind of the entities being uploaded.
   260     csv_filename: File on disk containing CSV data.
   297     csv_filename: File on disk containing CSV data.
   261     batch_size: Maximum number of CSV rows to stash into a WorkItem.
   298     batch_size: Maximum number of CSV rows to stash into an UploadWorkItem.
   262     csv_has_header: Whether to skip the first row of the CSV.
   299     csv_has_header: Whether to skip the first row of the CSV.
   263     openfile: Used for dependency injection.
   300     openfile: Used for dependency injection.
   264     create_csv_reader: Used for dependency injection.
   301     create_csv_reader: Used for dependency injection.
   265 
   302 
   266   Returns:
   303   Returns:
   267     A callable (accepting the Progress Queue and Progress Generators
   304     A callable (accepting the Progress Queue and Progress Generators
   268     as input) which creates the WorkItem generator.
   305     as input) which creates the UploadWorkItem generator.
   269   """
   306   """
   270   loader = Loader.RegisteredLoader(kind)
   307   loader = Loader.RegisteredLoader(kind)
   271   loader._Loader__openfile = openfile
   308   loader._Loader__openfile = openfile
   272   loader._Loader__create_csv_reader = create_csv_reader
   309   loader._Loader__create_csv_reader = create_csv_reader
   273   record_generator = loader.generate_records(csv_filename)
   310   record_generator = loader.generate_records(csv_filename)
   274 
   311 
   275   def CreateGenerator(progress_queue, progress_generator):
   312   def CreateGenerator(request_manager, progress_queue, progress_generator):
   276     """Initialize a WorkItem generator linked to a progress generator and queue.
   313     """Initialize a UploadWorkItem generator.
   277 
   314 
   278     Args:
   315     Args:
       
   316       request_manager: A RequestManager instance.
   279       progress_queue: A ProgressQueue instance to send progress information.
   317       progress_queue: A ProgressQueue instance to send progress information.
   280       progress_generator: A generator of progress information or None.
   318       progress_generator: A generator of progress information or None.
   281 
   319 
   282     Returns:
   320     Returns:
   283       A WorkItemGenerator instance.
   321       An UploadWorkItemGenerator instance.
   284     """
   322     """
   285     return WorkItemGenerator(progress_queue,
   323     return UploadWorkItemGenerator(request_manager,
   286                              progress_generator,
   324                                    progress_queue,
   287                              record_generator,
   325                                    progress_generator,
   288                              csv_has_header,
   326                                    record_generator,
   289                              batch_size)
   327                                    csv_has_header,
       
   328                                    batch_size)
   290 
   329 
   291   return CreateGenerator
   330   return CreateGenerator
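# Illustrative wiring of the factory above (argument values hypothetical):
# the returned callable is invoked later with the request manager and the
# progress machinery, and yields UploadWorkItems via Batches().
#
#   factory = GetCSVGeneratorFactory('Album', 'albums.csv',
#                                    batch_size=10, csv_has_header=True)
#   generator = factory(request_manager, progress_queue, progress_generator)
#   for work_item in generator.Batches():
#     ...  # hand each UploadWorkItem to the worker threads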
   292 
   331 
   293 
   332 
   294 class WorkItemGenerator(object):
   333 class UploadWorkItemGenerator(object):
   295   """Reads rows from a row generator and generates WorkItems of batches."""
   334   """Reads rows from a row generator and generates UploadWorkItems."""
   296 
   335 
   297   def __init__(self,
   336   def __init__(self,
       
   337                request_manager,
   298                progress_queue,
   338                progress_queue,
   299                progress_generator,
   339                progress_generator,
   300                record_generator,
   340                record_generator,
   301                skip_first,
   341                skip_first,
   302                batch_size):
   342                batch_size):
   303     """Initialize a WorkItemGenerator.
   343     """Initialize a WorkItemGenerator.
   304 
   344 
   305     Args:
   345     Args:
       
   346       request_manager: A RequestManager instance with which to associate
       
   347         WorkItems.
   306       progress_queue: A progress queue with which to associate WorkItems.
   348       progress_queue: A progress queue with which to associate WorkItems.
   307       progress_generator: A generator of progress information.
   349       progress_generator: A generator of progress information.
   308       record_generator: A generator of data records.
   350       record_generator: A generator of data records.
   309       skip_first: Whether to skip the first data record.
   351       skip_first: Whether to skip the first data record.
   310       batch_size: The number of data records per WorkItem.
   352       batch_size: The number of data records per WorkItem.
   311     """
   353     """
       
   354     self.request_manager = request_manager
   312     self.progress_queue = progress_queue
   355     self.progress_queue = progress_queue
   313     self.progress_generator = progress_generator
   356     self.progress_generator = progress_generator
   314     self.reader = record_generator
   357     self.reader = record_generator
   315     self.skip_first = skip_first
   358     self.skip_first = skip_first
   316     self.batch_size = batch_size
   359     self.batch_size = batch_size
   358                             (self.column_count, str(row)))
   401                             (self.column_count, str(row)))
   359       self.read_rows.append((self.line_number, row))
   402       self.read_rows.append((self.line_number, row))
   360       self.line_number += 1
   403       self.line_number += 1
   361 
   404 
   362   def _MakeItem(self, key_start, key_end, rows, progress_key=None):
   405   def _MakeItem(self, key_start, key_end, rows, progress_key=None):
   363     """Makes a WorkItem containing the given rows, with the given keys.
   406     """Makes a UploadWorkItem containing the given rows, with the given keys.
   364 
   407 
   365     Args:
   408     Args:
   366       key_start: The start key for the WorkItem.
   409       key_start: The start key for the UploadWorkItem.
   367       key_end: The end key for the WorkItem.
   410       key_end: The end key for the UploadWorkItem.
   368       rows: A list of the rows for the WorkItem.
   411       rows: A list of the rows for the UploadWorkItem.
   369       progress_key: The progress key for the WorkItem
   412       progress_key: The progress key for the UploadWorkItem
   370 
   413 
   371     Returns:
   414     Returns:
   372       A WorkItem instance for the given batch.
   415       An UploadWorkItem instance for the given batch.
   373     """
   416     """
   374     assert rows
   417     assert rows
   375 
   418 
   376     item = WorkItem(self.progress_queue, rows,
   419     item = UploadWorkItem(self.request_manager, self.progress_queue, rows,
   377                     key_start, key_end,
   420                           key_start, key_end, progress_key=progress_key)
   378                     progress_key=progress_key)
       
   379 
   421 
   380     return item
   422     return item
   381 
   423 
   382   def Batches(self):
   424   def Batches(self):
   383     """Reads from the record_generator and generates WorkItems.
   425     """Reads from the record_generator and generates UploadWorkItems.
   384 
   426 
   385     Yields:
   427     Yields:
   386       Instances of class WorkItem
   428       Instances of class UploadWorkItem
   387 
   429 
   388     Raises:
   430     Raises:
   389       ResumeError: If the progress database and data file indicate a different
   431       ResumeError: If the progress database and data file indicate a different
   390         number of rows.
   432         number of rows.
   391     """
   433     """
   466       ResumeError: If the progress database and data file indicate a different
   508       ResumeError: If the progress database and data file indicate a different
   467         number of rows.
   509         number of rows.
   468     """
   510     """
   469     csv_file = self.openfile(self.csv_filename, 'rb')
   511     csv_file = self.openfile(self.csv_filename, 'rb')
   470     reader = self.create_csv_reader(csv_file, skipinitialspace=True)
   512     reader = self.create_csv_reader(csv_file, skipinitialspace=True)
   471     return reader
   513     try:
   472 
   514       for record in reader:
   473 
   515         yield record
   474 class KeyRangeGenerator(object):
   516     except csv.Error, e:
       
   517       if e.args and e.args[0].startswith('field larger than field limit'):
       
   518         limit = e.args[1]
       
   519         raise FieldSizeLimitError(limit)
       
   520       else:
       
   521         raise
       
   522 
       
   523 
       
   524 class KeyRangeItemGenerator(object):
   475   """Generates ranges of keys to download.
   525   """Generates ranges of keys to download.
   476 
   526 
   477   Reads progress information from the progress database and creates
   527   Reads progress information from the progress database and creates
   478   KeyRange objects corresponding to incompletely downloaded parts of an
   528   KeyRangeItem objects corresponding to incompletely downloaded parts of an
   479   export.
   529   export.
   480   """
   530   """
   481 
   531 
   482   def __init__(self, kind, progress_queue, progress_generator):
   532   def __init__(self, request_manager, kind, progress_queue, progress_generator,
   483     """Initialize the KeyRangeGenerator.
   533                key_range_item_factory):
   484 
   534     """Initialize the KeyRangeItemGenerator.
   485     Args:
   535 
       
   536     Args:
       
   537       request_manager: A RequestManager instance.
   486       kind: The kind of entities being transferred.
   538       kind: The kind of entities being transferred.
   487       progress_queue: A queue used for tracking progress information.
   539       progress_queue: A queue used for tracking progress information.
   488       progress_generator: A generator of prior progress information, or None
   540       progress_generator: A generator of prior progress information, or None
   489         if there is no prior status.
   541         if there is no prior status.
   490     """
   542       key_range_item_factory: A factory to produce KeyRangeItems.
       
   543     """
       
   544     self.request_manager = request_manager
   491     self.kind = kind
   545     self.kind = kind
   492     self.row_count = 0
   546     self.row_count = 0
   493     self.xfer_count = 0
   547     self.xfer_count = 0
   494     self.progress_queue = progress_queue
   548     self.progress_queue = progress_queue
   495     self.progress_generator = progress_generator
   549     self.progress_generator = progress_generator
       
   550     self.key_range_item_factory = key_range_item_factory
   496 
   551 
   497   def Batches(self):
   552   def Batches(self):
   498     """Iterate through saved progress information.
   553     """Iterate through saved progress information.
   499 
   554 
   500     Yields:
   555     Yields:
   501       KeyRange instances corresponding to undownloaded key ranges.
   556       KeyRangeItem instances corresponding to undownloaded key ranges.
   502     """
   557     """
   503     if self.progress_generator is not None:
   558     if self.progress_generator is not None:
   504       for progress_key, state, key_start, key_end in self.progress_generator:
   559       for progress_key, state, key_start, key_end in self.progress_generator:
   505         if state is not None and state != STATE_GOT and key_start is not None:
   560         if state is not None and state != STATE_GOT and key_start is not None:
   506           key_start = ParseKey(key_start)
   561           key_start = ParseKey(key_start)
   507           key_end = ParseKey(key_end)
   562           key_end = ParseKey(key_end)
   508 
   563 
   509           result = KeyRange(self.progress_queue,
   564           key_range = KeyRange(key_start=key_start,
   510                             self.kind,
   565                                key_end=key_end)
   511                             key_start=key_start,
   566 
   512                             key_end=key_end,
   567           result = self.key_range_item_factory(self.request_manager,
   513                             progress_key=progress_key,
   568                                                self.progress_queue,
   514                             direction=KeyRange.ASC,
   569                                                self.kind,
   515                             state=STATE_READ)
   570                                                key_range,
       
   571                                                progress_key=progress_key,
       
   572                                                state=STATE_READ)
   516           yield result
   573           yield result
   517     else:
   574     else:
   518 
   575       key_range = KeyRange()
   519       yield KeyRange(
   576 
   520           self.progress_queue, self.kind,
   577       yield self.key_range_item_factory(self.request_manager,
   521           key_start=None,
   578                                         self.progress_queue,
   522           key_end=None,
   579                                         self.kind,
   523           direction=KeyRange.DESC)
   580                                         key_range)
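# Resume semantics of Batches() above: when prior progress exists, only key
# ranges with a recorded state other than STATE_GOT (and a concrete start
# key) are re-yielded, reset to STATE_READ; with no prior progress a single
# unbounded KeyRange() item covers the entire kind.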
   524 
   581 
   525 
   582 
   526 class ReQueue(object):
   583 class DownloadResult(object):
   527   """A special thread-safe queue.
   584   """Holds the result of an entity download."""
   528 
       
   529   A ReQueue allows unfinished work items to be returned with a call to
       
   530   reput().  When an item is reput, task_done() should *not* be called
       
   531   in addition, getting an item that has been reput does not increase
       
   532   the number of outstanding tasks.
       
   533 
       
   534   This class shares an interface with Queue.Queue and provides the
       
   535   additional reput method.
       
   536   """
       
   537 
       
   538   def __init__(self,
       
   539                queue_capacity,
       
   540                requeue_capacity=None,
       
   541                queue_factory=Queue.Queue,
       
   542                get_time=time.time):
       
   543     """Initialize a ReQueue instance.
       
   544 
       
   545     Args:
       
   546       queue_capacity: The number of items that can be put in the ReQueue.
       
   547       requeue_capacity: The number of items that can be reput in the ReQueue.
       
   548       queue_factory: Used for dependency injection.
       
   549       get_time: Used for dependency injection.
       
   550     """
       
   551     if requeue_capacity is None:
       
   552       requeue_capacity = queue_capacity
       
   553 
       
   554     self.get_time = get_time
       
   555     self.queue = queue_factory(queue_capacity)
       
   556     self.requeue = queue_factory(requeue_capacity)
       
   557     self.lock = threading.Lock()
       
   558     self.put_cond = threading.Condition(self.lock)
       
   559     self.get_cond = threading.Condition(self.lock)
       
   560 
       
   561   def _DoWithTimeout(self,
       
   562                      action,
       
   563                      exc,
       
   564                      wait_cond,
       
   565                      done_cond,
       
   566                      lock,
       
   567                      timeout=None,
       
   568                      block=True):
       
   569     """Performs the given action with a timeout.
       
   570 
       
   571     The action must be non-blocking, and raise an instance of exc on a
       
   572     recoverable failure.  If the action fails with an instance of exc,
       
   573     we wait on wait_cond before trying again.  Failure after the
       
   574     timeout is reached is propagated as an exception.  Success is
       
   575     signalled by notifying on done_cond and returning the result of
       
   576     the action.  If action raises any exception besides an instance of
       
   577     exc, it is immediately propagated.
       
   578 
       
   579     Args:
       
   580       action: A callable that performs a non-blocking action.
       
   581       exc: An exception type that is thrown by the action to indicate
       
   582         a recoverable error.
       
   583       wait_cond: A condition variable which should be waited on when
       
   584         action throws exc.
       
   585       done_cond: A condition variable to signal if the action returns.
       
   586       lock: The lock used by wait_cond and done_cond.
       
   587       timeout: A non-negative float indicating the maximum time to wait.
       
   588       block: Whether to block if the action cannot complete immediately.
       
   589 
       
   590     Returns:
       
   591       The result of the action, if it is successful.
       
   592 
       
   593     Raises:
       
   594       ValueError: If the timeout argument is negative.
       
   595     """
       
   596     if timeout is not None and timeout < 0.0:
       
   597       raise ValueError('\'timeout\' must not be a negative number')
       
   598     if not block:
       
   599       timeout = 0.0
       
   600     result = None
       
   601     success = False
       
   602     start_time = self.get_time()
       
   603     lock.acquire()
       
   604     try:
       
   605       while not success:
       
   606         try:
       
   607           result = action()
       
   608           success = True
       
   609         except Exception, e:
       
   610           if not isinstance(e, exc):
       
   611             raise e
       
   612           if timeout is not None:
       
   613             elapsed_time = self.get_time() - start_time
       
   614             timeout -= elapsed_time
       
   615             if timeout <= 0.0:
       
   616               raise e
       
   617           wait_cond.wait(timeout)
       
   618     finally:
       
   619       if success:
       
   620         done_cond.notify()
       
   621       lock.release()
       
   622     return result
       
   623 
       
   624   def put(self, item, block=True, timeout=None):
       
   625     """Put an item into the requeue.
       
   626 
       
   627     Args:
       
   628       item: An item to add to the requeue.
       
   629       block: Whether to block if the requeue is full.
       
   630       timeout: Maximum time to wait until the queue is non-full.
       
   631 
       
   632     Raises:
       
   633       Queue.Full if the queue is full and the timeout expires.
       
   634     """
       
   635     def PutAction():
       
   636       self.queue.put(item, block=False)
       
   637     self._DoWithTimeout(PutAction,
       
   638                         Queue.Full,
       
   639                         self.get_cond,
       
   640                         self.put_cond,
       
   641                         self.lock,
       
   642                         timeout=timeout,
       
   643                         block=block)
       
   644 
       
   645   def reput(self, item, block=True, timeout=None):
       
   646     """Re-put an item back into the requeue.
       
   647 
       
   648     Re-putting an item does not increase the number of outstanding
       
   649     tasks, so the reput item should be uniquely associated with an
       
   650     item that was previously removed from the requeue and for which
       
   651     task_done() has not been called.
       
   652 
       
   653     Args:
       
   654       item: An item to add to the requeue.
       
   655       block: Whether to block if the requeue is full.
       
   656       timeout: Maximum time to wait until the queue is non-full.
       
   657 
       
   658     Raises:
       
   659       Queue.Full if the queue is full and the timeout expires.
       
   660     """
       
   661     def ReputAction():
       
   662       self.requeue.put(item, block=False)
       
   663     self._DoWithTimeout(ReputAction,
       
   664                         Queue.Full,
       
   665                         self.get_cond,
       
   666                         self.put_cond,
       
   667                         self.lock,
       
   668                         timeout=timeout,
       
   669                         block=block)
       
   670 
       
   671   def get(self, block=True, timeout=None):
       
   672     """Get an item from the requeue.
       
   673 
       
   674     Args:
       
   675       block: Whether to block if the requeue is empty.
       
   676       timeout: Maximum time to wait until the requeue is non-empty.
       
   677 
       
   678     Returns:
       
   679       An item from the requeue.
       
   680 
       
   681     Raises:
       
   682       Queue.Empty if the queue is empty and the timeout expires.
       
   683     """
       
   684     def GetAction():
       
   685       try:
       
   686         result = self.requeue.get(block=False)
       
   687         self.requeue.task_done()
       
   688       except Queue.Empty:
       
   689         result = self.queue.get(block=False)
       
   690       return result
       
   691     return self._DoWithTimeout(GetAction,
       
   692                                Queue.Empty,
       
   693                                self.put_cond,
       
   694                                self.get_cond,
       
   695                                self.lock,
       
   696                                timeout=timeout,
       
   697                                block=block)
       
   698 
       
   699   def join(self):
       
   700     """Blocks until all of the items in the requeue have been processed."""
       
   701     self.queue.join()
       
   702 
       
   703   def task_done(self):
       
   704     """Indicate that a previously enqueued item has been fully processed."""
       
   705     self.queue.task_done()
       
   706 
       
   707   def empty(self):
       
   708     """Returns true if the requeue is empty."""
       
   709     return self.queue.empty() and self.requeue.empty()
       
   710 
       
   711   def get_nowait(self):
       
   712     """Try to get an item from the queue without blocking."""
       
   713     return self.get(block=False)
       
   714 
       
   715   def qsize(self):
       
   716     return self.queue.qsize() + self.requeue.qsize()
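# ReQueue now lives in google.appengine.tools.requeue (imported above) with
# the same interface. A minimal usage sketch (worker names hypothetical):
#
#   work_queue = ReQueue(queue_capacity=10)
#   work_queue.put('item-1')
#   item = work_queue.get()
#   try:
#     ProcessItem(item)        # hypothetical worker function
#   except RecoverableError:   # hypothetical transient failure
#     work_queue.reput(item)   # retry later; do *not* call task_done()
#   else:
#     work_queue.task_done()
#   work_queue.join()          # returns once every put() item is processed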
       
   717 
       
   718 
       
   719 class ThrottleHandler(urllib2.BaseHandler):
       
   720   """A urllib2 handler for http and https requests that adds to a throttle."""
       
   721 
       
   722   def __init__(self, throttle):
       
   723     """Initialize a ThrottleHandler.
       
   724 
       
   725     Args:
       
   726       throttle: A Throttle instance to call for bandwidth and http/https request
       
   727         throttling.
       
   728     """
       
   729     self.throttle = throttle
       
   730 
       
   731   def AddRequest(self, throttle_name, req):
       
   732     """Add to bandwidth throttle for given request.
       
   733 
       
   734     Args:
       
   735       throttle_name: The name of the bandwidth throttle to add to.
       
   736       req: The request whose size will be added to the throttle.
       
   737     """
       
   738     size = 0
       
   739     for key, value in req.headers.iteritems():
       
   740       size += len('%s: %s\n' % (key, value))
       
   741     for key, value in req.unredirected_hdrs.iteritems():
       
   742       size += len('%s: %s\n' % (key, value))
       
   743     (unused_scheme,
       
   744      unused_host_port, url_path,
       
   745      unused_query, unused_fragment) = urlparse.urlsplit(req.get_full_url())
       
   746     size += len('%s %s HTTP/1.1\n' % (req.get_method(), url_path))
       
   747     data = req.get_data()
       
   748     if data:
       
   749       size += len(data)
       
   750     self.throttle.AddTransfer(throttle_name, size)
       
   751 
       
   752   def AddResponse(self, throttle_name, res):
       
   753     """Add to bandwidth throttle for given response.
       
   754 
       
   755     Args:
       
   756       throttle_name: The name of the bandwidth throttle to add to.
       
   757       res: The response whose size will be added to the throttle.
       
   758     """
       
   759     content = res.read()
       
   760     def ReturnContent():
       
   761       return content
       
   762     res.read = ReturnContent
       
   763     size = len(content)
       
   764     headers = res.info()
       
   765     for key, value in headers.items():
       
   766       size += len('%s: %s\n' % (key, value))
       
   767     self.throttle.AddTransfer(throttle_name, size)
       
   768 
       
   769   def http_request(self, req):
       
   770     """Process an HTTP request.
       
   771 
       
   772     If the throttle is over quota, sleep first.  Then add request size to
       
   773     throttle before returning it to be sent.
       
   774 
       
   775     Args:
       
   776       req: A urllib2.Request object.
       
   777 
       
   778     Returns:
       
   779       The request passed in.
       
   780     """
       
   781     self.throttle.Sleep()
       
   782     self.AddRequest(BANDWIDTH_UP, req)
       
   783     return req
       
   784 
       
   785   def https_request(self, req):
       
   786     """Process an HTTPS request.
       
   787 
       
   788     If the throttle is over quota, sleep first.  Then add request size to
       
   789     throttle before returning it to be sent.
       
   790 
       
   791     Args:
       
   792       req: A urllib2.Request object.
       
   793 
       
   794     Returns:
       
   795       The request passed in.
       
   796     """
       
   797     self.throttle.Sleep()
       
   798     self.AddRequest(HTTPS_BANDWIDTH_UP, req)
       
   799     return req
       
   800 
       
   801   def http_response(self, unused_req, res):
       
   802     """Process an HTTP response.
       
   803 
       
   804     The size of the response is added to the bandwidth throttle and the request
       
   805     throttle is incremented by one.
       
   806 
       
   807     Args:
       
   808       unused_req: The urllib2 request for this response.
       
   809       res: A urllib2 response object.
       
   810 
       
   811     Returns:
       
   812       The response passed in.
       
   813     """
       
   814     self.AddResponse(BANDWIDTH_DOWN, res)
       
   815     self.throttle.AddTransfer(REQUESTS, 1)
       
   816     return res
       
   817 
       
   818   def https_response(self, unused_req, res):
       
   819     """Process an HTTPS response.
       
   820 
       
   821     The size of the response is added to the bandwidth throttle and the request
       
   822     throttle is incremented by one.
       
   823 
       
   824     Args:
       
   825       unused_req: The urllib2 request for this response.
       
   826       res: A urllib2 response object.
       
   827 
       
   828     Returns:
       
   829       The response passed in.
       
   830     """
       
   831     self.AddResponse(HTTPS_BANDWIDTH_DOWN, res)
       
   832     self.throttle.AddTransfer(HTTPS_REQUESTS, 1)
       
   833     return res
       
   834 
       
   835 
       
   836 class ThrottledHttpRpcServer(appengine_rpc.HttpRpcServer):
       
   837   """Provides a simplified RPC-style interface for HTTP requests.
       
   838 
       
   839   This RPC server uses a Throttle to prevent exceeding quotas.
       
   840   """
       
   841 
       
   842   def __init__(self, throttle, request_manager, *args, **kwargs):
       
   843     """Initialize a ThrottledHttpRpcServer.
       
   844 
       
   845     Also sets request_manager.rpc_server to the ThrottledHttpRpcServer instance.
       
   846 
       
   847     Args:
       
   848       throttle: A Throttles instance.
       
   849       request_manager: A RequestManager instance.
       
   850       args: Positional arguments to pass through to
       
   851         appengine_rpc.HttpRpcServer.__init__
       
   852       kwargs: Keyword arguments to pass through to
       
   853         appengine_rpc.HttpRpcServer.__init__
       
   854     """
       
   855     self.throttle = throttle
       
   856     appengine_rpc.HttpRpcServer.__init__(self, *args, **kwargs)
       
   857     request_manager.rpc_server = self
       
   858 
       
   859   def _GetOpener(self):
       
   860     """Returns an OpenerDirector that supports cookies and ignores redirects.
       
   861 
       
   862     Returns:
       
   863       A urllib2.OpenerDirector object.
       
   864     """
       
   865     opener = appengine_rpc.HttpRpcServer._GetOpener(self)
       
   866     opener.add_handler(ThrottleHandler(self.throttle))
       
   867 
       
   868     return opener
       
   869 
       
   870 
       
   871 def ThrottledHttpRpcServerFactory(throttle, request_manager):
       
   872   """Create a factory to produce ThrottledHttpRpcServer for a given throttle.
       
   873 
       
   874   Args:
       
   875     throttle: A Throttle instance to use for the ThrottledHttpRpcServer.
       
   876     request_manager: A RequestManager instance.
       
   877 
       
   878   Returns:
       
   879     A factory to produce a ThrottledHttpRpcServer.
       
   880   """
       
   881 
       
   882   def MakeRpcServer(*args, **kwargs):
       
   883     """Factory to produce a ThrottledHttpRpcServer.
       
   884 
       
   885     Args:
       
   886       args: Positional args to pass to ThrottledHttpRpcServer.
       
   887       kwargs: Keyword args to pass to ThrottledHttpRpcServer.
       
   888 
       
   889     Returns:
       
   890       A ThrottledHttpRpcServer instance.
       
   891     """
       
   892     kwargs['account_type'] = 'HOSTED_OR_GOOGLE'
       
   893     kwargs['save_cookies'] = True
       
   894     return ThrottledHttpRpcServer(throttle, request_manager, *args, **kwargs)
       
   895   return MakeRpcServer
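# Sketch of how the factory was typically wired up; the positional arguments
# follow appengine_rpc.HttpRpcServer (host, auth_function, user_agent,
# source) and the values here are illustrative only:
#
#   rpc_server_factory = ThrottledHttpRpcServerFactory(throttle,
#                                                      request_manager)
#   rpc_server = rpc_server_factory('myapp.appspot.com', GetUserCredentials,
#                                   'bulkloader', 'bulkloader')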
       
   896 
       
   897 
       
   898 class ExportResult(object):
       
   899   """Holds the decoded content for the result of an export requests."""
       
   900 
   585 
   901   def __init__(self, continued, direction, keys, entities):
   586   def __init__(self, continued, direction, keys, entities):
   902     self.continued = continued
   587     self.continued = continued
   903     self.direction = direction
   588     self.direction = direction
   904     self.keys = keys
   589     self.keys = keys
   905     self.entities = entities
   590     self.entities = entities
   906     self.count = len(keys)
   591     self.count = len(keys)
   907     assert self.count == len(entities)
   592     assert self.count == len(entities)
   908     assert direction in (KeyRange.ASC, KeyRange.DESC)
   593     assert direction in (key_range_module.KeyRange.ASC,
       
   594                          key_range_module.KeyRange.DESC)
   909     if self.count > 0:
   595     if self.count > 0:
   910       if direction == KeyRange.ASC:
   596       if direction == key_range_module.KeyRange.ASC:
   911         self.key_start = keys[0]
   597         self.key_start = keys[0]
   912         self.key_end = keys[-1]
   598         self.key_end = keys[-1]
   913       else:
   599       else:
   914         self.key_start = keys[-1]
   600         self.key_start = keys[-1]
   915         self.key_end = keys[0]
   601         self.key_end = keys[0]
   916 
   602 
       
   603   def Entities(self):
       
   604     """Returns the list of entities for this result in key order."""
       
   605     if self.direction == key_range_module.KeyRange.ASC:
       
   606       return list(self.entities)
       
   607     else:
       
   608       result = list(self.entities)
       
   609       result.reverse()
       
   610       return result
       
   611 
   917   def __str__(self):
   612   def __str__(self):
   918     return 'continued = %s\n%s' % (
   613     return 'continued = %s\n%s' % (
   919         str(self.continued), '\n'.join(self.entities))
   614         str(self.continued), '\n'.join(str(entity) for entity in self.entities))
   920 
   615 
   921 
   616 
   922 class _WorkItem(object):
   617 class _WorkItem(adaptive_thread_pool.WorkItem):
   923   """Holds a description of a unit of upload or download work."""
   618   """Holds a description of a unit of upload or download work."""
   924 
   619 
   925   def __init__(self, progress_queue, key_start, key_end, state_namer,
   620   def __init__(self, progress_queue, key_start, key_end, state_namer,
   926                state=STATE_READ, progress_key=None):
   621                state=STATE_READ, progress_key=None):
   927     """Initialize the _WorkItem instance.
   622     """Initialize the _WorkItem instance.
   928 
   623 
   929     Args:
   624     Args:
   930       progress_queue: A queue used for tracking progress information.
   625       progress_queue: A queue used for tracking progress information.
   931       key_start: The starting key, inclusive.
   626       key_start: The start key of the work item.
   932       key_end: The ending key, inclusive.
   627       key_end: The end key of the work item.
   933       state_namer: Function to describe work item states.
   628       state_namer: Function to describe work item states.
   934       state: The initial state of the work item.
   629       state: The initial state of the work item.
   935       progress_key: If this WorkItem represents state from a prior run,
   630       progress_key: If this WorkItem represents state from a prior run,
   936         then this will be the key within the progress database.
   631         then this will be the key within the progress database.
   937     """
   632     """
       
   633     adaptive_thread_pool.WorkItem.__init__(self,
       
   634                                            '[%s-%s]' % (key_start, key_end))
   938     self.progress_queue = progress_queue
   635     self.progress_queue = progress_queue
   939     self.key_start = key_start
       
   940     self.key_end = key_end
       
   941     self.state_namer = state_namer
   636     self.state_namer = state_namer
   942     self.state = state
   637     self.state = state
   943     self.progress_key = progress_key
   638     self.progress_key = progress_key
   944     self.progress_event = threading.Event()
   639     self.progress_event = threading.Event()
       
   640     self.key_start = key_start
       
   641     self.key_end = key_end
       
   642     self.error = None
       
   643     self.traceback = None
       
   644 
       
   645   def _TransferItem(self, thread_pool):
       
   646     raise NotImplementedError()
       
   647 
       
   648   def SetError(self):
       
   649     """Sets the error and traceback information for this thread.
       
   650 
       
   651     This must be called from an exception handler.
       
   652     """
       
   653     if not self.error:
       
   654       exc_info = sys.exc_info()
       
   655       self.error = exc_info[1]
       
   656       self.traceback = exc_info[2]
       
   657 
       
   658   def PerformWork(self, thread_pool):
       
   659     """Perform the work of this work item and report the results.
       
   660 
       
   661     Args:
       
   662       thread_pool: An AdaptiveThreadPool instance.
       
   663 
       
   664     Returns:
       
   665       A tuple (status, instruction) of the work status and an instruction
       
   666       for the ThreadGate.
       
   667     """
       
   668     status = adaptive_thread_pool.WorkItem.FAILURE
       
   669     instruction = adaptive_thread_pool.ThreadGate.DECREASE
       
   670 
       
   671     try:
       
   672       self.MarkAsTransferring()
       
   673 
       
   674       try:
       
   675         transfer_time = self._TransferItem(thread_pool)
       
   676         if transfer_time is None:
       
   677           status = adaptive_thread_pool.WorkItem.RETRY
       
   678           instruction = adaptive_thread_pool.ThreadGate.HOLD
       
   679         else:
       
   680           logger.debug('[%s] %s Transferred %d entities in %0.1f seconds',
       
   681                        threading.currentThread().getName(), self, self.count,
       
   682                        transfer_time)
       
   683           sys.stdout.write('.')
       
   684           sys.stdout.flush()
       
   685           status = adaptive_thread_pool.WorkItem.SUCCESS
       
   686           if transfer_time <= MAXIMUM_INCREASE_DURATION:
       
   687             instruction = adaptive_thread_pool.ThreadGate.INCREASE
       
   688           elif transfer_time <= MAXIMUM_HOLD_DURATION:
       
   689             instruction = adaptive_thread_pool.ThreadGate.HOLD
       
   690       except (db.InternalError, db.NotSavedError, db.Timeout,
       
   691               db.TransactionFailedError,
       
   692               apiproxy_errors.OverQuotaError,
       
   693               apiproxy_errors.DeadlineExceededError,
       
   694               apiproxy_errors.ApplicationError), e:
       
   695         status = adaptive_thread_pool.WorkItem.RETRY
       
   696         logger.exception('Retrying on non-fatal datastore error: %s', e)
       
   697       except urllib2.HTTPError, e:
       
   698         http_status = e.code
       
   699         if http_status == 403 or (http_status >= 500 and http_status < 600):
       
   700           status = adaptive_thread_pool.WorkItem.RETRY
       
   701           logger.exception('Retrying on non-fatal HTTP error: %d %s',
       
   702                            http_status, e.msg)
       
   703         else:
       
   704           self.SetError()
       
   705           status = adaptive_thread_pool.WorkItem.FAILURE
       
   706       except urllib2.URLError, e:
       
   707         if IsURLErrorFatal(e):
       
   708           self.SetError()
       
   709           status = adaptive_thread_pool.WorkItem.FAILURE
       
   710         else:
       
   711           status = adaptive_thread_pool.WorkItem.RETRY
       
   712           logger.exception('Retrying on non-fatal URL error: %s', e.reason)
       
   713 
       
   714     finally:
       
   715       if status == adaptive_thread_pool.WorkItem.SUCCESS:
       
   716         self.MarkAsTransferred()
       
   717       else:
       
   718         self.MarkAsError()
       
   719 
       
   720     return (status, instruction)
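# How PerformWork() above maps a successful transfer time to a ThreadGate
# instruction, using the constants defined in this revision:
#
#   transfer_time <= MAXIMUM_INCREASE_DURATION (5.0s)  -> INCREASE
#   transfer_time <= MAXIMUM_HOLD_DURATION (12.0s)     -> HOLD
#   otherwise                                          -> DECREASE (default)
#
# A transfer_time of None means the item did not complete and is retried
# with a HOLD instruction.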
   945 
   721 
   946   def _AssertInState(self, *states):
   722   def _AssertInState(self, *states):
   947     """Raises an Error if the state of this range is not in states."""
   723     """Raises an Error if the state of this range is not in states."""
   948     if not self.state in states:
   724     if not self.state in states:
   949       raise BadStateError('%s:%s not in %s' %
   725       raise BadStateError('%s:%s not in %s' %
   961     self._AssertInState(STATE_READ)
   737     self._AssertInState(STATE_READ)
   962     self._StateTransition(STATE_READ, blocking=True)
   738     self._StateTransition(STATE_READ, blocking=True)
   963 
   739 
   964   def MarkAsTransferring(self):
   740   def MarkAsTransferring(self):
   965     """Mark this _WorkItem as transferring, updating the progress database."""
   741     """Mark this _WorkItem as transferring, updating the progress database."""
   966     self._AssertInState(STATE_READ, STATE_NOT_GOT)
   742     self._AssertInState(STATE_READ, STATE_ERROR)
   967     self._AssertProgressKey()
   743     self._AssertProgressKey()
   968     self._StateTransition(STATE_GETTING, blocking=True)
   744     self._StateTransition(STATE_GETTING, blocking=True)
   969 
   745 
   970   def MarkAsTransferred(self):
   746   def MarkAsTransferred(self):
   971     """Mark this _WorkItem as transferred, updating the progress database."""
   747     """Mark this _WorkItem as transferred, updating the progress database."""
   973 
   749 
   974   def MarkAsError(self):
   750   def MarkAsError(self):
   975     """Mark this _WorkItem as failed, updating the progress database."""
   751     """Mark this _WorkItem as failed, updating the progress database."""
   976     self._AssertInState(STATE_GETTING)
   752     self._AssertInState(STATE_GETTING)
   977     self._AssertProgressKey()
   753     self._AssertProgressKey()
   978     self._StateTransition(STATE_NOT_GOT, blocking=True)
   754     self._StateTransition(STATE_ERROR, blocking=True)
   979 
   755 
   980   def _StateTransition(self, new_state, blocking=False):
   756   def _StateTransition(self, new_state, blocking=False):
   981     """Transition the work item to a new state, storing progress information.
   757     """Transition the work item to a new state, storing progress information.
   982 
   758 
   983     Args:
   759     Args:
   996 
   772 
   997       self.progress_event.clear()
   773       self.progress_event.clear()
   998 
   774 
   999 
   775 
  1000 
   776 
  1001 class WorkItem(_WorkItem):
   777 class UploadWorkItem(_WorkItem):
  1002   """Holds a unit of uploading work.
   778   """Holds a unit of uploading work.
  1003 
   779 
  1004   A WorkItem represents a number of entities that need to be uploaded to
    780   An UploadWorkItem represents a number of entities that need to be uploaded to
  1005   Google App Engine. These entities are encoded in the "content" field of
   781   Google App Engine. These entities are encoded in the "content" field of
  1006   the WorkItem, and will be POST'd as-is to the server.
   782   the UploadWorkItem, and will be POST'd as-is to the server.
  1007 
   783 
  1008   The entities are identified by a range of numeric keys, inclusively. In
   784   The entities are identified by a range of numeric keys, inclusively. In
  1009   the case of a resumption of an upload, or a replay to correct errors,
   785   the case of a resumption of an upload, or a replay to correct errors,
  1010   these keys must be able to identify the same set of entities.
   786   these keys must be able to identify the same set of entities.
  1011 
   787 
  1012   Note that keys specify a range. The entities do not have to sequentially
   788   Note that keys specify a range. The entities do not have to sequentially
   1013   fill the entire range; they must simply bound a range of valid keys.
    789   fill the entire range; they must simply bound a range of valid keys.
  1014   """
   790   """
  1015 
   791 
  1016   def __init__(self, progress_queue, rows, key_start, key_end,
   792   def __init__(self, request_manager, progress_queue, rows, key_start, key_end,
  1017                progress_key=None):
   793                progress_key=None):
  1018     """Initialize the WorkItem instance.
   794     """Initialize the UploadWorkItem instance.
  1019 
   795 
  1020     Args:
   796     Args:
       
   797       request_manager: A RequestManager instance.
  1021       progress_queue: A queue used for tracking progress information.
   798       progress_queue: A queue used for tracking progress information.
  1022       rows: A list of pairs of a line number and a list of column values
   799       rows: A list of pairs of a line number and a list of column values
  1023       key_start: The (numeric) starting key, inclusive.
   800       key_start: The (numeric) starting key, inclusive.
  1024       key_end: The (numeric) ending key, inclusive.
   801       key_end: The (numeric) ending key, inclusive.
  1025       progress_key: If this WorkItem represents state from a prior run,
   802       progress_key: If this UploadWorkItem represents state from a prior run,
  1026         then this will be the key within the progress database.
   803         then this will be the key within the progress database.
  1027     """
   804     """
  1028     _WorkItem.__init__(self, progress_queue, key_start, key_end,
   805     _WorkItem.__init__(self, progress_queue, key_start, key_end,
  1029                        ImportStateName, state=STATE_READ,
   806                        ImportStateName, state=STATE_READ,
  1030                        progress_key=progress_key)
   807                        progress_key=progress_key)
  1031 
   808 
  1032     assert isinstance(key_start, (int, long))
   809     assert isinstance(key_start, (int, long))
  1033     assert isinstance(key_end, (int, long))
   810     assert isinstance(key_end, (int, long))
  1034     assert key_start <= key_end
   811     assert key_start <= key_end
  1035 
   812 
       
   813     self.request_manager = request_manager
  1036     self.rows = rows
   814     self.rows = rows
  1037     self.content = None
   815     self.content = None
  1038     self.count = len(rows)
   816     self.count = len(rows)
  1039 
   817 
  1040   def __str__(self):
   818   def __str__(self):
  1041     return '[%s-%s]' % (self.key_start, self.key_end)
   819     return '[%s-%s]' % (self.key_start, self.key_end)
  1042 
   820 
       
   821   def _TransferItem(self, thread_pool, get_time=time.time):
       
   822     """Transfers the entities associated with an item.
       
   823 
       
   824     Args:
       
   825       thread_pool: An AdaptiveThreadPool instance.
       
   826       get_time: Used for dependency injection.
       
   827     """
       
   828     t = get_time()
       
   829     if not self.content:
       
   830       self.content = self.request_manager.EncodeContent(self.rows)
       
   831     try:
       
   832       self.request_manager.PostEntities(self.content)
       
   833     except:
       
   834       raise
       
   835     return get_time() - t
       
   836 
  1043   def MarkAsTransferred(self):
   837   def MarkAsTransferred(self):
   1044     """Mark this WorkItem as successfully sent to the server."""
    838     """Mark this UploadWorkItem as successfully sent to the server."""
  1045 
   839 
  1046     self._AssertInState(STATE_SENDING)
   840     self._AssertInState(STATE_SENDING)
  1047     self._AssertProgressKey()
   841     self._AssertProgressKey()
  1048 
   842 
  1049     self._StateTransition(STATE_SENT, blocking=False)
   843     self._StateTransition(STATE_SENT, blocking=False)
  1066                          kind_or_class_key)
   860                          kind_or_class_key)
  1067   else:
   861   else:
  1068     implementation_class = db.class_for_kind(kind_or_class_key)
   862     implementation_class = db.class_for_kind(kind_or_class_key)
  1069   return implementation_class
   863   return implementation_class
  1070 
   864 
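
GetImplementationClass resolves a kind name (or a polymorphic class key) back
to the registered model class. A usage sketch, assuming a hypothetical Person
model has been registered in the usual way:

    from google.appengine.ext import db

    class Person(db.Model):
      name = db.StringProperty()

    # For a plain kind name, resolution goes through db.class_for_kind.
    assert GetImplementationClass('Person') is Person
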
  1071 class EmptyQuery(db.Query):
       
  1072   def get(self):
       
  1073     return None
       
  1074 
       
  1075   def fetch(self, limit=1000, offset=0):
       
  1076     return []
       
  1077 
       
  1078   def count(self, limit=1000):
       
  1079     return 0
       
  1080 
       
  1081 
   865 
  1082 def KeyLEQ(key1, key2):
   866 def KeyLEQ(key1, key2):
  1083   """Compare two keys for less-than-or-equal-to.
   867   """Compare two keys for less-than-or-equal-to.
  1084 
   868 
  1085   All keys with numeric ids come before all keys with names.
   869   All keys with numeric ids come before all keys with names. None represents
       
   870   an unbounded end-point so it is both greater and less than any other key.
  1086 
   871 
  1087   Args:
   872   Args:
  1088     key1: An int or db.Key instance.
   873     key1: An int or datastore.Key instance.
  1089     key2: An int or db.Key instance.
   874     key2: An int or datastore.Key instance.
  1090 
   875 
  1091   Returns:
   876   Returns:
  1092     True if key1 <= key2
   877     True if key1 <= key2
  1093   """
   878   """
  1094   if isinstance(key1, int) and isinstance(key2, int):
       
  1095     return key1 <= key2
       
  1096   if key1 is None or key2 is None:
   879   if key1 is None or key2 is None:
  1097     return True
   880     return True
  1098   if key1.id() and not key2.id():
   881   return key1 <= key2
  1099     return True
   882 
  1100   return key1.id_or_name() <= key2.id_or_name()
   883 
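
A usage sketch of KeyLEQ under the ordering described above; the kind and key
values are illustrative:

    from google.appengine.api import datastore

    k_id = datastore.Key.from_path('Person', 1)
    k_name = datastore.Key.from_path('Person', 'alice')

    assert KeyLEQ(k_id, k_name)   # numeric ids come before names
    assert KeyLEQ(None, k_name)   # None is an unbounded end-point
    assert KeyLEQ(k_id, None)
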
  1101 
   884 class KeyRangeItem(_WorkItem):
  1102 
   885   """Represents an item of work that scans over a key range.
  1103 class KeyRange(_WorkItem):
   886 
  1104   """Represents an item of download work.
    887   A KeyRangeItem object holds a KeyRange
  1105 
   888   and has an associated state: STATE_READ, STATE_GETTING, STATE_GOT,
  1106   A KeyRange object represents a key range (key_start, key_end) and a
   889   and STATE_ERROR.
  1107   scan direction (KeyRange.DESC or KeyRange.ASC).  The KeyRange object
       
  1108   has an associated state: STATE_READ, STATE_GETTING, STATE_GOT, and
       
  1109   STATE_ERROR.
       
  1110 
   890 
  1111   - STATE_READ indicates the range ready to be downloaded by a worker thread.
   891   - STATE_READ indicates the range ready to be downloaded by a worker thread.
  1112   - STATE_GETTING indicates the range is currently being downloaded.
   892   - STATE_GETTING indicates the range is currently being downloaded.
  1113   - STATE_GOT indicates that the range was successfully downloaded
   893   - STATE_GOT indicates that the range was successfully downloaded
  1114   - STATE_ERROR indicates that an error occurred during the last download
   894   - STATE_ERROR indicates that an error occurred during the last download
  1115     attempt
   895     attempt
  1116 
   896 
  1117   KeyRanges not in the STATE_GOT state are stored in the progress database.
   897   KeyRangeItems not in the STATE_GOT state are stored in the progress database.
  1118   When a piece of KeyRange work is downloaded, the download may cover only
   898   When a piece of KeyRangeItem work is downloaded, the download may cover only
  1119   a portion of the range.  In this case, the old KeyRange is removed from
   899   a portion of the range.  In this case, the old KeyRangeItem is removed from
  1120   the progress database and ranges covering the undownloaded range are
   900   the progress database and ranges covering the undownloaded range are
  1121   generated and stored as STATE_READ in the export progress database.
   901   generated and stored as STATE_READ in the export progress database.
  1122   """
   902   """
  1123 
   903 
  1124   DESC = 0
       
  1125   ASC = 1
       
  1126 
       
  1127   MAX_KEY_LEN = 500
       
  1128 
       
  1129   def __init__(self,
   904   def __init__(self,
       
   905                request_manager,
  1130                progress_queue,
   906                progress_queue,
  1131                kind,
   907                kind,
  1132                direction,
   908                key_range,
  1133                key_start=None,
       
  1134                key_end=None,
       
  1135                include_start=True,
       
  1136                include_end=True,
       
  1137                progress_key=None,
   909                progress_key=None,
  1138                state=STATE_READ):
   910                state=STATE_READ):
  1139     """Initialize a KeyRange object.
   911     """Initialize a KeyRangeItem object.
  1140 
   912 
  1141     Args:
   913     Args:
       
   914       request_manager: A RequestManager instance.
  1142       progress_queue: A queue used for tracking progress information.
   915       progress_queue: A queue used for tracking progress information.
  1143       kind: The kind of entities for this range.
   916       kind: The kind of entities for this range.
  1144       direction: The direction of the query for this range.
   917       key_range: A KeyRange instance for this work item.
  1145       key_start: The starting key for this range.
       
  1146       key_end: The ending key for this range.
       
  1147       include_start: Whether the start key should be included in the range.
       
  1148       include_end: Whether the end key should be included in the range.
       
  1149       progress_key: The key for this range within the progress database.
   918       progress_key: The key for this range within the progress database.
  1150       state: The initial state of this range.
   919       state: The initial state of this range.
  1151 
   920     """
  1152     Raises:
   921     _WorkItem.__init__(self, progress_queue, key_range.key_start,
  1153       KeyRangeError: if key_start is None.
   922                        key_range.key_end, ExportStateName, state=state,
  1154     """
   923                        progress_key=progress_key)
  1155     assert direction in (KeyRange.ASC, KeyRange.DESC)
   924     self.request_manager = request_manager
  1156     _WorkItem.__init__(self, progress_queue, key_start, key_end,
       
  1157                        ExportStateName, state=state, progress_key=progress_key)
       
  1158     self.kind = kind
   925     self.kind = kind
  1159     self.direction = direction
   926     self.key_range = key_range
  1160     self.export_result = None
   927     self.download_result = None
  1161     self.count = 0
   928     self.count = 0
  1162     self.include_start = include_start
   929     self.key_start = key_range.key_start
  1163     self.include_end = include_end
   930     self.key_end = key_range.key_end
  1164     self.SPLIT_KEY = db.Key.from_path(self.kind, unichr(0))
       
  1165 
   931 
  1166   def __str__(self):
   932   def __str__(self):
  1167     return '[%s-%s]' % (PrettyKey(self.key_start), PrettyKey(self.key_end))
   933     return str(self.key_range)
  1168 
   934 
  1169   def __repr__(self):
   935   def __repr__(self):
  1170     return self.__str__()
   936     return self.__str__()
  1171 
   937 
  1172   def MarkAsTransferred(self):
   938   def MarkAsTransferred(self):
  1173     """Mark this KeyRange as transferred, updating the progress database."""
   939     """Mark this KeyRangeItem as transferred, updating the progress database."""
  1174     pass
   940     pass
  1175 
   941 
  1176   def Process(self, export_result, num_threads, batch_size, work_queue):
   942   def Process(self, download_result, thread_pool, batch_size,
  1177     """Mark this KeyRange as success, updating the progress database.
   943               new_state=STATE_GOT):
  1178 
   944     """Mark this KeyRangeItem as success, updating the progress database.
  1179     Process will split this KeyRange based on the content of export_result and
   945 
   1180     add the unfinished ranges to the work queue.
   946     Process will split this KeyRangeItem based on the content of
  1181 
    947     download_result and add the unfinished ranges to the work queue.
  1182     Args:
   948 
  1183       export_result: An ExportResult instance.
   949     Args:
  1184       num_threads: The number of threads for parallel transfers.
   950       download_result: A DownloadResult instance.
       
   951       thread_pool: An AdaptiveThreadPool instance.
  1185       batch_size: The number of entities to transfer per request.
   952       batch_size: The number of entities to transfer per request.
  1186       work_queue: The work queue to add unfinished ranges to.
   953       new_state: The state to transition the completed range to.
  1187 
       
  1188     Returns:
       
   1189       None. Undownloaded sub-ranges are added back to the work queue.
       
  1190     """
   954     """
  1191     self._AssertInState(STATE_GETTING)
   955     self._AssertInState(STATE_GETTING)
  1192     self._AssertProgressKey()
   956     self._AssertProgressKey()
  1193 
   957 
  1194     self.export_result = export_result
   958     self.download_result = download_result
  1195     self.count = len(export_result.keys)
   959     self.count = len(download_result.keys)
  1196     if export_result.continued:
   960     if download_result.continued:
  1197       self._FinishedRange()._StateTransition(STATE_GOT, blocking=True)
   961       self._FinishedRange()._StateTransition(new_state, blocking=True)
  1198       self._AddUnfinishedRanges(num_threads, batch_size, work_queue)
   962       self._AddUnfinishedRanges(thread_pool, batch_size)
  1199     else:
   963     else:
  1200       self._StateTransition(STATE_GOT, blocking=True)
   964       self._StateTransition(new_state, blocking=True)
  1201 
   965 
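
To make the state machine concrete, a single download pass over one item
looks roughly like this (a sketch of the calling sequence, not code from
this module):

    item.MarkAsTransferring()       # STATE_READ -> STATE_GETTING
    result = request_manager.GetEntities(item)
    # Process moves the finished part of the range to STATE_GOT; if the
    # fetch was truncated (result.continued), the remainder is re-queued
    # as fresh STATE_READ items via _AddUnfinishedRanges.
    item.Process(result, thread_pool, request_manager.batch_size)
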
  1202   def _FinishedRange(self):
   966   def _FinishedRange(self):
  1203     """Returns the range completed by the export_result.
   967     """Returns the range completed by the download_result.
  1204 
   968 
  1205     Returns:
   969     Returns:
  1206       A KeyRange representing a completed range.
   970       A KeyRangeItem representing a completed range.
  1207     """
   971     """
  1208     assert self.export_result is not None
   972     assert self.download_result is not None
  1209 
   973 
  1210     if self.direction == KeyRange.ASC:
   974     if self.key_range.direction == key_range_module.KeyRange.ASC:
  1211       key_start = self.key_start
   975       key_start = self.key_range.key_start
  1212       if self.export_result.continued:
   976       if self.download_result.continued:
  1213         key_end = self.export_result.key_end
   977         key_end = self.download_result.key_end
  1214       else:
   978       else:
  1215         key_end = self.key_end
   979         key_end = self.key_range.key_end
  1216     else:
   980     else:
  1217       key_end = self.key_end
   981       key_end = self.key_range.key_end
  1218       if self.export_result.continued:
   982       if self.download_result.continued:
  1219         key_start = self.export_result.key_start
   983         key_start = self.download_result.key_start
  1220       else:
   984       else:
  1221         key_start = self.key_start
   985         key_start = self.key_range.key_start
  1222 
   986 
  1223     result = KeyRange(self.progress_queue,
   987     key_range = KeyRange(key_start=key_start,
  1224                       self.kind,
   988                          key_end=key_end,
  1225                       key_start=key_start,
   989                          direction=self.key_range.direction)
  1226                       key_end=key_end,
   990 
  1227                       direction=self.direction)
   991     result = self.__class__(self.request_manager,
  1228 
   992                             self.progress_queue,
  1229     result.progress_key = self.progress_key
   993                             self.kind,
  1230     result.export_result = self.export_result
   994                             key_range,
  1231     result.state = self.state
   995                             progress_key=self.progress_key,
       
   996                             state=self.state)
       
   997 
       
   998     result.download_result = self.download_result
  1232     result.count = self.count
   999     result.count = self.count
  1233     return result
  1000     return result
  1234 
  1001 
  1235   def FilterQuery(self, query):
  1002   def _SplitAndAddRanges(self, thread_pool, batch_size):
  1236     """Add query filter to restrict to this key range.
  1003     """Split the key range [key_start, key_end] into a list of ranges."""
  1237 
  1004     if self.download_result.direction == key_range_module.KeyRange.ASC:
  1238     Args:
  1005       key_range = KeyRange(
  1239       query: A db.Query instance.
  1006           key_start=self.download_result.key_end,
  1240     """
  1007           key_end=self.key_range.key_end,
  1241     if self.key_start == self.key_end and not (
  1008           include_start=False)
  1242         self.include_start or self.include_end):
       
  1243       return EmptyQuery()
       
  1244     if self.include_start:
       
  1245       start_comparator = '>='
       
  1246     else:
  1009     else:
  1247       start_comparator = '>'
  1010       key_range = KeyRange(
  1248     if self.include_end:
  1011           key_start=self.key_range.key_start,
  1249       end_comparator = '<='
  1012           key_end=self.download_result.key_start,
       
  1013           include_end=False)
       
  1014 
       
  1015     if thread_pool.QueuedItemCount() > 2 * thread_pool.num_threads():
       
  1016       ranges = [key_range]
  1250     else:
  1017     else:
  1251       end_comparator = '<'
  1018       ranges = key_range.split_range(batch_size=batch_size)
  1252     if self.key_start and self.key_end:
       
  1253       query.filter('__key__ %s' % start_comparator, self.key_start)
       
  1254       query.filter('__key__ %s' % end_comparator, self.key_end)
       
  1255     elif self.key_start:
       
  1256       query.filter('__key__ %s' % start_comparator, self.key_start)
       
  1257     elif self.key_end:
       
  1258       query.filter('__key__ %s' % end_comparator, self.key_end)
       
  1259 
       
  1260     return query
       
  1261 
       
  1262   def MakeParallelQuery(self):
       
  1263     """Construct a query for this key range, for parallel downloading.
       
  1264 
       
  1265     Returns:
       
  1266       A db.Query instance.
       
  1267 
       
  1268     Raises:
       
  1269       KeyRangeError: if self.direction is not one of
       
  1270         KeyRange.ASC, KeyRange.DESC
       
  1271     """
       
  1272     if self.direction == KeyRange.ASC:
       
  1273       direction = ''
       
  1274     elif self.direction == KeyRange.DESC:
       
  1275       direction = '-'
       
  1276     else:
       
   1277       raise KeyRangeError('KeyRange direction unexpected: %s' % self.direction)
       
  1278     query = db.Query(GetImplementationClass(self.kind))
       
  1279     query.order('%s__key__' % direction)
       
  1280 
       
  1281     return self.FilterQuery(query)
       
  1282 
       
  1283   def MakeSerialQuery(self):
       
  1284     """Construct a query for this key range without descending __key__ scan.
       
  1285 
       
  1286     Returns:
       
  1287       A db.Query instance.
       
  1288     """
       
  1289     query = db.Query(GetImplementationClass(self.kind))
       
  1290     query.order('__key__')
       
  1291 
       
  1292     return self.FilterQuery(query)
       
  1293 
       
  1294   def _BisectStringRange(self, start, end):
       
  1295     if start == end:
       
  1296       return (start, start, end)
       
  1297     start += '\0'
       
  1298     end += '\0'
       
  1299     midpoint = []
       
  1300     expected_max = 127
       
  1301     for i in xrange(min(len(start), len(end))):
       
  1302       if start[i] == end[i]:
       
  1303         midpoint.append(start[i])
       
  1304       else:
       
  1305         ord_sum = ord(start[i]) + ord(end[i])
       
  1306         midpoint.append(unichr(ord_sum / 2))
       
  1307         if ord_sum % 2:
       
  1308           if len(start) > i + 1:
       
  1309             ord_start = ord(start[i+1])
       
  1310           else:
       
  1311             ord_start = 0
       
  1312           if ord_start < expected_max:
       
  1313             ord_split = (expected_max + ord_start) / 2
       
  1314           else:
       
  1315             ord_split = (0xFFFF + ord_start) / 2
       
  1316           midpoint.append(unichr(ord_split))
       
  1317         break
       
  1318     return (start[:-1], ''.join(midpoint), end[:-1])
       
  1319 
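
The deleted _BisectStringRange helper picks an approximate lexicographic
midpoint so a range of key names can be cut in two. Worked examples, given a
KeyRange instance kr from the old code:

    kr._BisectStringRange('a', 'c')    # -> ('a', 'b', 'c')
    kr._BisectStringRange('ab', 'ad')  # -> ('ab', 'ac', 'ad')
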
       
  1320   def SplitRange(self, key_start, include_start, key_end, include_end,
       
  1321                  export_result, num_threads, batch_size, work_queue):
       
  1322     """Split the key range [key_start, key_end] into a list of ranges."""
       
  1323     if export_result.direction == KeyRange.ASC:
       
  1324       key_start = export_result.key_end
       
  1325       include_start = False
       
  1326     else:
       
  1327       key_end = export_result.key_start
       
  1328       include_end = False
       
  1329     key_pairs = []
       
  1330     if not key_start:
       
  1331       key_pairs.append((key_start, include_start, key_end, include_end,
       
  1332                         KeyRange.ASC))
       
  1333     elif not key_end:
       
  1334       key_pairs.append((key_start, include_start, key_end, include_end,
       
  1335                         KeyRange.DESC))
       
  1336     elif work_queue.qsize() > 2 * num_threads:
       
  1337       key_pairs.append((key_start, include_start, key_end, include_end,
       
  1338                         KeyRange.ASC))
       
  1339     elif key_start.id() and key_end.id():
       
  1340       if key_end.id() - key_start.id() > batch_size:
       
  1341         key_half = db.Key.from_path(self.kind,
       
  1342                                     (key_start.id() + key_end.id()) / 2)
       
  1343         key_pairs.append((key_start, include_start,
       
  1344                           key_half, True,
       
  1345                           KeyRange.DESC))
       
  1346         key_pairs.append((key_half, False,
       
  1347                           key_end, include_end,
       
  1348                           KeyRange.ASC))
       
  1349       else:
       
  1350         key_pairs.append((key_start, include_start, key_end, include_end,
       
  1351                           KeyRange.ASC))
       
  1352     elif key_start.name() and key_end.name():
       
  1353       (start, middle, end) = self._BisectStringRange(key_start.name(),
       
  1354                                                      key_end.name())
       
  1355       key_pairs.append((key_start, include_start,
       
  1356                         db.Key.from_path(self.kind, middle), True,
       
  1357                         KeyRange.DESC))
       
  1358       key_pairs.append((db.Key.from_path(self.kind, middle), False,
       
  1359                         key_end, include_end,
       
  1360                         KeyRange.ASC))
       
  1361     else:
       
  1362       assert key_start.id() and key_end.name()
       
  1363       key_pairs.append((key_start, include_start,
       
  1364                         self.SPLIT_KEY, False,
       
  1365                         KeyRange.DESC))
       
  1366       key_pairs.append((self.SPLIT_KEY, True,
       
  1367                         key_end, include_end,
       
  1368                         KeyRange.ASC))
       
  1369 
       
  1370     ranges = [KeyRange(self.progress_queue,
       
  1371                        self.kind,
       
  1372                        key_start=start,
       
  1373                        include_start=include_start,
       
  1374                        key_end=end,
       
  1375                        include_end=include_end,
       
  1376                        direction=direction)
       
  1377               for (start, include_start, end, include_end, direction)
       
  1378               in key_pairs]
       
  1379 
  1019 
  1380     for key_range in ranges:
  1020     for key_range in ranges:
  1381       key_range.MarkAsRead()
  1021       key_range_item = self.__class__(self.request_manager,
  1382       work_queue.put(key_range, block=True)
  1022                                       self.progress_queue,
  1383 
  1023                                       self.kind,
  1384   def _AddUnfinishedRanges(self, num_threads, batch_size, work_queue):
  1024                                       key_range)
  1385     """Adds incomplete KeyRanges to the work_queue.
  1025       key_range_item.MarkAsRead()
  1386 
  1026       thread_pool.SubmitItem(key_range_item, block=True)
  1387     Args:
  1027 
  1388       num_threads: The number of threads for parallel transfers.
  1028   def _AddUnfinishedRanges(self, thread_pool, batch_size):
       
  1029     """Adds incomplete KeyRanges to the thread_pool.
       
  1030 
       
  1031     Args:
       
  1032       thread_pool: An AdaptiveThreadPool instance.
  1389       batch_size: The number of entities to transfer per request.
  1033       batch_size: The number of entities to transfer per request.
  1390       work_queue: The work queue to add unfinished ranges to.
       
  1391 
  1034 
  1392     Returns:
  1035     Returns:
   1393       None. Incomplete sub-ranges are re-queued for transfer instead.
   1036       None. Incomplete sub-ranges are re-queued for transfer instead.
  1394 
  1037 
  1395     Raises:
  1038     Raises:
  1396       KeyRangeError: if this key range has already been completely transferred.
  1039       KeyRangeError: if this key range has already been completely transferred.
  1397     """
  1040     """
  1398     assert self.export_result is not None
  1041     assert self.download_result is not None
  1399     if self.export_result.continued:
  1042     if self.download_result.continued:
  1400       self.SplitRange(self.key_start, self.include_start, self.key_end,
  1043       self._SplitAndAddRanges(thread_pool, batch_size)
  1401                       self.include_end, self.export_result,
       
  1402                       num_threads, batch_size, work_queue)
       
  1403     else:
  1044     else:
  1404       raise KeyRangeError('No unfinished part of key range.')
  1045       raise KeyRangeError('No unfinished part of key range.')
       
  1046 
       
  1047 
       
  1048 class DownloadItem(KeyRangeItem):
       
  1049   """A KeyRangeItem for downloading key ranges."""
       
  1050 
       
  1051   def _TransferItem(self, thread_pool, get_time=time.time):
       
  1052     """Transfers the entities associated with an item."""
       
  1053     t = get_time()
       
  1054     download_result = self.request_manager.GetEntities(self)
       
  1055     transfer_time = get_time() - t
       
  1056     self.Process(download_result, thread_pool,
       
  1057                  self.request_manager.batch_size)
       
  1058     return transfer_time
       
  1059 
       
  1060 
       
  1061 class MapperItem(KeyRangeItem):
       
  1062   """A KeyRangeItem for mapping over key ranges."""
       
  1063 
       
  1064   def _TransferItem(self, thread_pool, get_time=time.time):
       
  1065     t = get_time()
       
  1066     download_result = self.request_manager.GetEntities(self)
       
  1067     transfer_time = get_time() - t
       
  1068     mapper = self.request_manager.GetMapper()
       
  1069     try:
       
  1070       mapper.batch_apply(download_result.Entities())
       
  1071     except MapperRetry:
       
  1072       return None
       
  1073     self.Process(download_result, thread_pool,
       
  1074                  self.request_manager.batch_size)
       
  1075     return transfer_time
  1405 
  1076 
  1406 
  1077 
  1407 class RequestManager(object):
  1078 class RequestManager(object):
  1408   """A class which wraps a connection to the server."""
  1079   """A class which wraps a connection to the server."""
  1409 
  1080 
  1414                kind,
  1085                kind,
  1415                throttle,
  1086                throttle,
  1416                batch_size,
  1087                batch_size,
  1417                secure,
  1088                secure,
  1418                email,
  1089                email,
  1419                passin):
  1090                passin,
       
  1091                dry_run=False):
  1420     """Initialize a RequestManager object.
  1092     """Initialize a RequestManager object.
  1421 
  1093 
  1422     Args:
  1094     Args:
  1423       app_id: String containing the application id for requests.
  1095       app_id: String containing the application id for requests.
  1424       host_port: String containing the "host:port" pair; the port is optional.
  1096       host_port: String containing the "host:port" pair; the port is optional.
  1443     self.authenticated = False
  1115     self.authenticated = False
  1444     self.auth_called = False
  1116     self.auth_called = False
  1445     self.parallel_download = True
  1117     self.parallel_download = True
  1446     self.email = email
  1118     self.email = email
  1447     self.passin = passin
  1119     self.passin = passin
  1448     throttled_rpc_server_factory = ThrottledHttpRpcServerFactory(
  1120     self.mapper = None
  1449         self.throttle, self)
  1121     self.dry_run = dry_run
       
  1122 
       
  1123     if self.dry_run:
       
  1124       logger.info('Running in dry run mode, skipping remote_api setup')
       
  1125       return
       
  1126 
  1450     logger.debug('Configuring remote_api. url_path = %s, '
  1127     logger.debug('Configuring remote_api. url_path = %s, '
  1451                  'servername = %s' % (url_path, host_port))
  1128                  'servername = %s' % (url_path, host_port))
       
  1129 
       
  1130     def CookieHttpRpcServer(*args, **kwargs):
       
  1131       kwargs['save_cookies'] = True
       
  1132       kwargs['account_type'] = 'HOSTED_OR_GOOGLE'
       
  1133       return appengine_rpc.HttpRpcServer(*args, **kwargs)
       
  1134 
  1452     remote_api_stub.ConfigureRemoteDatastore(
  1135     remote_api_stub.ConfigureRemoteDatastore(
  1453         app_id,
  1136         app_id,
  1454         url_path,
  1137         url_path,
  1455         self.AuthFunction,
  1138         self.AuthFunction,
  1456         servername=host_port,
  1139         servername=host_port,
  1457         rpc_server_factory=throttled_rpc_server_factory,
  1140         rpc_server_factory=CookieHttpRpcServer,
  1458         secure=self.secure)
  1141         secure=self.secure)
       
  1142     remote_api_throttle.ThrottleRemoteDatastore(self.throttle)
  1459     logger.debug('Bulkloader using app_id: %s', os.environ['APPLICATION_ID'])
  1143     logger.debug('Bulkloader using app_id: %s', os.environ['APPLICATION_ID'])
  1460 
  1144 
  1461   def Authenticate(self):
  1145   def Authenticate(self):
  1462     """Invoke authentication if necessary."""
  1146     """Invoke authentication if necessary."""
  1463     logger.info('Connecting to %s', self.url_path)
  1147     logger.info('Connecting to %s%s', self.host_port, self.url_path)
  1464     self.rpc_server.Send(self.url_path, payload=None)
  1148     if self.dry_run:
       
  1149       self.authenticated = True
       
  1150       return
       
  1151 
       
  1152     remote_api_stub.MaybeInvokeAuthentication()
  1465     self.authenticated = True
  1153     self.authenticated = True
  1466 
  1154 
  1467   def AuthFunction(self,
  1155   def AuthFunction(self,
  1468                    raw_input_fn=raw_input,
  1156                    raw_input_fn=raw_input,
  1469                    password_input_fn=getpass.getpass):
  1157                    password_input_fn=getpass.getpass):
  1504     Args:
  1192     Args:
  1505       rows: A list of pairs of a line number and a list of column values.
  1193       rows: A list of pairs of a line number and a list of column values.
  1506       loader: Used for dependency injection.
  1194       loader: Used for dependency injection.
  1507 
  1195 
  1508     Returns:
  1196     Returns:
  1509       A list of db.Model instances.
  1197       A list of datastore.Entity instances.
  1510 
  1198 
  1511     Raises:
  1199     Raises:
  1512       ConfigurationError: if no loader is defined for self.kind
  1200       ConfigurationError: if no loader is defined for self.kind
  1513     """
  1201     """
  1514     if not loader:
  1202     if not loader:
  1518         logger.error('No Loader defined for kind %s.' % self.kind)
  1206         logger.error('No Loader defined for kind %s.' % self.kind)
  1519         raise ConfigurationError('No Loader defined for kind %s.' % self.kind)
  1207         raise ConfigurationError('No Loader defined for kind %s.' % self.kind)
  1520     entities = []
  1208     entities = []
  1521     for line_number, values in rows:
  1209     for line_number, values in rows:
  1522       key = loader.generate_key(line_number, values)
  1210       key = loader.generate_key(line_number, values)
  1523       if isinstance(key, db.Key):
  1211       if isinstance(key, datastore.Key):
  1524         parent = key.parent()
  1212         parent = key.parent()
  1525         key = key.name()
  1213         key = key.name()
  1526       else:
  1214       else:
  1527         parent = None
  1215         parent = None
  1528       entity = loader.create_entity(values, key_name=key, parent=parent)
  1216       entity = loader.create_entity(values, key_name=key, parent=parent)
       
  1217 
       
  1218       def ToEntity(entity):
       
  1219         if isinstance(entity, db.Model):
       
  1220           return entity._populate_entity()
       
  1221         else:
       
  1222           return entity
       
  1223 
  1529       if isinstance(entity, list):
  1224       if isinstance(entity, list):
  1530         entities.extend(entity)
  1225         entities.extend(map(ToEntity, entity))
  1531       elif entity:
  1226       elif entity:
  1532         entities.append(entity)
  1227         entities.append(ToEntity(entity))
  1533 
  1228 
  1534     return entities
  1229     return entities
  1535 
  1230 
  1536   def PostEntities(self, item):
  1231   def PostEntities(self, entities):
  1537     """Posts Entity records to a remote endpoint over HTTP.
  1232     """Posts Entity records to a remote endpoint over HTTP.
  1538 
  1233 
  1539     Args:
  1234     Args:
  1540       item: A workitem containing the entities to post.
  1235       entities: A list of datastore entities.
       
  1236     """
       
  1237     if self.dry_run:
       
  1238       return
       
  1239     datastore.Put(entities)
       
  1240 
       
  1241   def _QueryForPbs(self, query):
       
  1242     """Perform the given query and return a list of entity_pb's."""
       
  1243     try:
       
  1244       query_pb = query._ToPb(limit=self.batch_size)
       
  1245       result_pb = datastore_pb.QueryResult()
       
  1246       apiproxy_stub_map.MakeSyncCall('datastore_v3', 'RunQuery', query_pb,
       
  1247                                      result_pb)
       
  1248       next_pb = datastore_pb.NextRequest()
       
  1249       next_pb.set_count(self.batch_size)
       
  1250       next_pb.mutable_cursor().CopyFrom(result_pb.cursor())
       
  1251       result_pb = datastore_pb.QueryResult()
       
  1252       apiproxy_stub_map.MakeSyncCall('datastore_v3', 'Next', next_pb, result_pb)
       
  1253       return result_pb.result_list()
       
  1254     except apiproxy_errors.ApplicationError, e:
       
  1255       raise datastore._ToDatastoreError(e)
       
  1256 
       
  1257   def GetEntities(self, key_range_item, key_factory=datastore.Key):
       
  1258     """Gets Entity records from a remote endpoint over HTTP.
       
  1259 
       
  1260     Args:
       
  1261      key_range_item: Range of keys to get.
       
  1262      key_factory: Used for dependency injection.
  1541 
  1263 
  1542     Returns:
  1264     Returns:
   1543       None. The entities are written with db.put; no response payload is
  1265       A DownloadResult instance.
   1544         returned to the caller.
       
  1545     """
       
  1546     entities = item.content
       
  1547     db.put(entities)
       
  1548 
       
  1549   def GetEntities(self, key_range):
       
  1550     """Gets Entity records from a remote endpoint over HTTP.
       
  1551 
       
  1552     Args:
       
  1553      key_range: Range of keys to get.
       
  1554 
       
  1555     Returns:
       
  1556       An ExportResult instance.
       
  1557 
  1266 
  1558     Raises:
  1267     Raises:
  1559       ConfigurationError: if no Exporter is defined for self.kind
  1268       ConfigurationError: if no Exporter is defined for self.kind
  1560     """
  1269     """
  1561     try:
       
  1562       Exporter.RegisteredExporter(self.kind)
       
  1563     except KeyError:
       
  1564       raise ConfigurationError('No Exporter defined for kind %s.' % self.kind)
       
  1565 
       
  1566     keys = []
  1270     keys = []
  1567     entities = []
  1271     entities = []
  1568 
  1272 
  1569     if self.parallel_download:
  1273     if self.parallel_download:
  1570       query = key_range.MakeParallelQuery()
  1274       query = key_range_item.key_range.make_directed_datastore_query(self.kind)
  1571       try:
  1275       try:
  1572         results = query.fetch(self.batch_size)
  1276         results = self._QueryForPbs(query)
  1573       except datastore_errors.NeedIndexError:
  1277       except datastore_errors.NeedIndexError:
  1574         logger.info('%s: No descending index on __key__, '
  1278         logger.info('%s: No descending index on __key__, '
  1575                     'performing serial download', self.kind)
  1279                     'performing serial download', self.kind)
  1576         self.parallel_download = False
  1280         self.parallel_download = False
  1577 
  1281 
  1578     if not self.parallel_download:
  1282     if not self.parallel_download:
  1579       key_range.direction = KeyRange.ASC
  1283       key_range_item.key_range.direction = key_range_module.KeyRange.ASC
  1580       query = key_range.MakeSerialQuery()
  1284       query = key_range_item.key_range.make_ascending_datastore_query(self.kind)
  1581       results = query.fetch(self.batch_size)
  1285       results = self._QueryForPbs(query)
  1582 
  1286 
  1583     size = len(results)
  1287     size = len(results)
  1584 
  1288 
  1585     for model in results:
  1289     for entity in results:
  1586       key = model.key()
  1290       key = key_factory()
  1587       entities.append(cPickle.dumps(model))
  1291       key._Key__reference = entity.key()
       
  1292       entities.append(entity)
  1588       keys.append(key)
  1293       keys.append(key)
  1589 
  1294 
  1590     continued = (size == self.batch_size)
  1295     continued = (size == self.batch_size)
  1591     key_range.count = size
  1296     key_range_item.count = size
  1592 
  1297 
  1593     return ExportResult(continued, key_range.direction, keys, entities)
  1298     return DownloadResult(continued, key_range_item.key_range.direction,
       
  1299                           keys, entities)
       
  1300 
       
  1301   def GetMapper(self):
       
  1302     """Returns a mapper for the registered kind.
       
  1303 
       
  1304     Returns:
       
  1305       A Mapper instance.
       
  1306 
       
  1307     Raises:
       
  1308       ConfigurationError: if no Mapper is defined for self.kind
       
  1309     """
       
  1310     if not self.mapper:
       
  1311       try:
       
  1312         self.mapper = Mapper.RegisteredMapper(self.kind)
       
  1313       except KeyError:
       
  1314         logger.error('No Mapper defined for kind %s.' % self.kind)
       
  1315         raise ConfigurationError('No Mapper defined for kind %s.' % self.kind)
       
  1316     return self.mapper
  1594 
  1317 
  1595 
  1318 
  1596 def InterruptibleSleep(sleep_time):
  1319 def InterruptibleSleep(sleep_time):
   1597   """Puts thread to sleep, checking this thread's exit_flag twice a second.
   1320   """Puts thread to sleep, checking this thread's exit_flag twice a second.
  1598 
  1321 
  1609     slept += this_sleep_time
  1332     slept += this_sleep_time
  1610     if thread.exit_flag:
  1333     if thread.exit_flag:
  1611       return
  1334       return
  1612 
  1335 
  1613 
  1336 
  1614 class ThreadGate(object):
       
  1615   """Manage the number of active worker threads.
       
  1616 
       
  1617   The ThreadGate limits the number of threads that are simultaneously
       
  1618   uploading batches of records in order to implement adaptive rate
       
  1619   control.  The number of simultaneous upload threads that it takes to
       
   1620   start causing timeouts varies widely over the course of the day, so
       
  1621   adaptive rate control allows the uploader to do many uploads while
       
  1622   reducing the error rate and thus increasing the throughput.
       
  1623 
       
  1624   Initially the ThreadGate allows only one uploader thread to be active.
       
  1625   For each successful upload, another thread is activated and for each
       
  1626   failed upload, the number of active threads is reduced by one.
       
  1627   """
       
  1628 
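
The resulting policy is additive: one worker to start, plus one per fast
transfer, minus one per slow transfer or failure. A behavioral sketch with
illustrative durations (the real cut-offs are MAXIMUM_INCREASE_DURATION and
MAXIMUM_HOLD_DURATION):

    gate = ThreadGate(enabled=True)
    gate.TransferSuccess(0.5)     # fast: enable one more worker thread
    gate.TransferSuccess(60.0)    # slow (> threshhold2): decrease workers
    gate.DecreaseWorkers()        # failure: decrease, possibly backing off
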
       
  1629   def __init__(self, enabled,
       
  1630                threshhold1=MAXIMUM_INCREASE_DURATION,
       
  1631                threshhold2=MAXIMUM_HOLD_DURATION,
       
  1632                sleep=InterruptibleSleep):
       
  1633     """Constructor for ThreadGate instances.
       
  1634 
       
  1635     Args:
       
  1636       enabled: Whether the thread gate is enabled
       
  1637       threshhold1: Maximum duration (in seconds) for a transfer to increase
       
  1638         the number of active threads.
       
   1639       threshhold2: Maximum duration (in seconds) a transfer may take without
       
   1640         decreasing the number of active threads.
       
  1641     """
       
  1642     self.enabled = enabled
       
  1643     self.enabled_count = 1
       
  1644     self.lock = threading.Lock()
       
  1645     self.thread_semaphore = threading.Semaphore(self.enabled_count)
       
  1646     self._threads = []
       
  1647     self.backoff_time = 0
       
  1648     self.sleep = sleep
       
  1649     self.threshhold1 = threshhold1
       
  1650     self.threshhold2 = threshhold2
       
  1651 
       
  1652   def Register(self, thread):
       
  1653     """Register a thread with the thread gate."""
       
  1654     self._threads.append(thread)
       
  1655 
       
  1656   def Threads(self):
       
  1657     """Yields the registered threads."""
       
  1658     for thread in self._threads:
       
  1659       yield thread
       
  1660 
       
  1661   def EnableThread(self):
       
  1662     """Enable one more worker thread."""
       
  1663     self.lock.acquire()
       
  1664     try:
       
  1665       self.enabled_count += 1
       
  1666     finally:
       
  1667       self.lock.release()
       
  1668     self.thread_semaphore.release()
       
  1669 
       
  1670   def EnableAllThreads(self):
       
  1671     """Enable all worker threads."""
       
  1672     for unused_idx in xrange(len(self._threads) - self.enabled_count):
       
  1673       self.EnableThread()
       
  1674 
       
  1675   def StartWork(self):
       
  1676     """Starts a critical section in which the number of workers is limited.
       
  1677 
       
  1678     If thread throttling is enabled then this method starts a critical
       
  1679     section which allows self.enabled_count simultaneously operating
       
  1680     threads. The critical section is ended by calling self.FinishWork().
       
  1681     """
       
  1682     if self.enabled:
       
  1683       self.thread_semaphore.acquire()
       
  1684       if self.backoff_time > 0.0:
       
  1685         if not threading.currentThread().exit_flag:
       
  1686           logger.info('Backing off: %.1f seconds',
       
  1687                       self.backoff_time)
       
  1688         self.sleep(self.backoff_time)
       
  1689 
       
  1690   def FinishWork(self):
       
  1691     """Ends a critical section started with self.StartWork()."""
       
  1692     if self.enabled:
       
  1693       self.thread_semaphore.release()
       
  1694 
       
  1695   def TransferSuccess(self, duration):
       
  1696     """Informs the throttler that an item was successfully sent.
       
  1697 
       
  1698     If thread throttling is enabled and the duration is low enough, this
       
  1699     method will cause an additional thread to run in the critical section.
       
  1700 
       
  1701     Args:
       
  1702       duration: The duration of the transfer in seconds.
       
  1703     """
       
  1704     if duration > self.threshhold2:
       
  1705       logger.debug('Transfer took %s, decreasing workers.', duration)
       
  1706       self.DecreaseWorkers(backoff=False)
       
  1707       return
       
  1708     elif duration > self.threshhold1:
       
  1709       logger.debug('Transfer took %s, not increasing workers.', duration)
       
  1710       return
       
  1711     elif self.enabled:
       
  1712       if self.backoff_time > 0.0:
       
  1713         logger.info('Resetting backoff to 0.0')
       
  1714         self.backoff_time = 0.0
       
  1715       do_enable = False
       
  1716       self.lock.acquire()
       
  1717       try:
       
  1718         if self.enabled and len(self._threads) > self.enabled_count:
       
  1719           do_enable = True
       
  1720           self.enabled_count += 1
       
  1721       finally:
       
  1722         self.lock.release()
       
  1723       if do_enable:
       
  1724         logger.debug('Increasing active thread count to %d',
       
  1725                      self.enabled_count)
       
  1726         self.thread_semaphore.release()
       
  1727 
       
  1728   def DecreaseWorkers(self, backoff=True):
       
  1729     """Informs the thread_gate that an item failed to send.
       
  1730 
       
  1731     If thread throttling is enabled, this method will cause the
       
  1732     throttler to allow one fewer thread in the critical section. If
       
  1733     there is only one thread remaining, failures will result in
       
  1734     exponential backoff until there is a success.
       
  1735 
       
  1736     Args:
       
  1737       backoff: Whether to increase exponential backoff if there is only
       
  1738         one thread enabled.
       
  1739     """
       
  1740     if self.enabled:
       
  1741       do_disable = False
       
  1742       self.lock.acquire()
       
  1743       try:
       
  1744         if self.enabled:
       
  1745           if self.enabled_count > 1:
       
  1746             do_disable = True
       
  1747             self.enabled_count -= 1
       
  1748           elif backoff:
       
  1749             if self.backoff_time == 0.0:
       
  1750               self.backoff_time = INITIAL_BACKOFF
       
  1751             else:
       
  1752               self.backoff_time *= BACKOFF_FACTOR
       
  1753       finally:
       
  1754         self.lock.release()
       
  1755       if do_disable:
       
  1756         logger.debug('Decreasing the number of active threads to %d',
       
  1757                      self.enabled_count)
       
  1758         self.thread_semaphore.acquire()
       
  1759 
       
  1760 
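
Once only one worker remains, each further failure grows the backoff
geometrically until a success resets it to zero. Assuming, purely for
illustration, INITIAL_BACKOFF = 1.0 and BACKOFF_FACTOR = 2.0 (the actual
constants are defined earlier in this file):

    backoff = 0.0
    for _ in xrange(4):
      if backoff == 0.0:
        backoff = INITIAL_BACKOFF   # 1.0
      else:
        backoff *= BACKOFF_FACTOR   # 2.0, then 4.0, then 8.0
    # StartWork() then sleeps for backoff seconds before admitting work.
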
       
  1761 class Throttle(object):
       
  1762   """A base class for upload rate throttling.
       
  1763 
       
   1764   Transferring a large number of records too quickly to an application
       
  1765   could trigger quota limits and cause the transfer process to halt.
       
  1766   In order to stay within the application's quota, we throttle the
       
  1767   data transfer to a specified limit (across all transfer threads).
       
  1768   This limit defaults to about half of the Google App Engine default
       
   1769   for an application, but can be manually adjusted up or down as
       
  1770   appropriate.
       
  1771 
       
  1772   This class tracks a moving average of some aspect of the transfer
       
  1773   rate (bandwidth, records per second, http connections per
       
  1774   second). It keeps two windows of counts of bytes transferred, on a
       
  1775   per-thread basis. One block is the "current" block, and the other is
       
  1776   the "prior" block. It will rotate the counts from current to prior
       
  1777   when ROTATE_PERIOD has passed.  Thus, the current block will
       
  1778   represent from 0 seconds to ROTATE_PERIOD seconds of activity
       
  1779   (determined by: time.time() - self.last_rotate).  The prior block
       
  1780   will always represent a full ROTATE_PERIOD.
       
  1781 
       
  1782   Sleeping is performed just before a transfer of another block, and is
       
  1783   based on the counts transferred *before* the next transfer. It really
       
  1784   does not matter how much will be transferred, but only that for all the
       
   1785   data transferred SO FAR we have interspersed enough pauses to
       
  1786   ensure the aggregate transfer rate is within the specified limit.
       
  1787 
       
  1788   These counts are maintained on a per-thread basis, so we do not require
       
  1789   any interlocks around incrementing the counts. There IS an interlock on
       
  1790   the rotation of the counts because we do not want multiple threads to
       
  1791   multiply-rotate the counts.
       
  1792 
       
  1793   There are various race conditions in the computation and collection
       
  1794   of these counts. We do not require precise values, but simply to
       
  1795   keep the overall transfer within the bandwidth limits. If a given
       
  1796   pause is a little short, or a little long, then the aggregate delays
       
  1797   will be correct.
       
  1798   """
       
  1799 
       
  1800   ROTATE_PERIOD = 600
       
  1801 
       
  1802   def __init__(self,
       
  1803                get_time=time.time,
       
  1804                thread_sleep=InterruptibleSleep,
       
  1805                layout=None):
       
  1806     self.get_time = get_time
       
  1807     self.thread_sleep = thread_sleep
       
  1808 
       
  1809     self.start_time = get_time()
       
  1810     self.transferred = {}
       
  1811     self.prior_block = {}
       
  1812     self.totals = {}
       
  1813     self.throttles = {}
       
  1814 
       
  1815     self.last_rotate = {}
       
  1816     self.rotate_mutex = {}
       
  1817     if layout:
       
  1818       self.AddThrottles(layout)
       
  1819 
       
  1820   def AddThrottle(self, name, limit):
       
  1821     self.throttles[name] = limit
       
  1822     self.transferred[name] = {}
       
  1823     self.prior_block[name] = {}
       
  1824     self.totals[name] = {}
       
  1825     self.last_rotate[name] = self.get_time()
       
  1826     self.rotate_mutex[name] = threading.Lock()
       
  1827 
       
  1828   def AddThrottles(self, layout):
       
  1829     for key, value in layout.iteritems():
       
  1830       self.AddThrottle(key, value)
       
  1831 
       
  1832   def Register(self, thread):
       
  1833     """Register this thread with the throttler."""
       
  1834     thread_name = thread.getName()
       
  1835     for throttle_name in self.throttles.iterkeys():
       
  1836       self.transferred[throttle_name][thread_name] = 0
       
  1837       self.prior_block[throttle_name][thread_name] = 0
       
  1838       self.totals[throttle_name][thread_name] = 0
       
  1839 
       
  1840   def VerifyName(self, throttle_name):
       
  1841     if throttle_name not in self.throttles:
       
  1842       raise AssertionError('%s is not a registered throttle' % throttle_name)
       
  1843 
       
  1844   def AddTransfer(self, throttle_name, token_count):
       
  1845     """Add a count to the amount this thread has transferred.
       
  1846 
       
  1847     Each time a thread transfers some data, it should call this method to
       
  1848     note the amount sent. The counts may be rotated if sufficient time
       
  1849     has passed since the last rotation.
       
  1850 
       
  1851     Note: this method should only be called by the BulkLoaderThread
       
  1852     instances. The token count is allocated towards the
       
  1853     "current thread".
       
  1854 
       
  1855     Args:
       
  1856       throttle_name: The name of the throttle to add to.
       
  1857       token_count: The number to add to the throttle counter.
       
  1858     """
       
  1859     self.VerifyName(throttle_name)
       
  1860     transferred = self.transferred[throttle_name]
       
  1861     transferred[threading.currentThread().getName()] += token_count
       
  1862 
       
  1863     if self.last_rotate[throttle_name] + self.ROTATE_PERIOD < self.get_time():
       
  1864       self._RotateCounts(throttle_name)
       
  1865 
       
  1866   def Sleep(self, throttle_name=None):
       
  1867     """Possibly sleep in order to limit the transfer rate.
       
  1868 
       
  1869     Note that we sleep based on *prior* transfers rather than what we
       
  1870     may be about to transfer. The next transfer could put us under/over
       
  1871     and that will be rectified *after* that transfer. The net result is that
       
  1872     the average transfer rate will remain within bounds. Spiky behavior
       
  1873     or uneven rates among the threads could possibly bring the transfer
       
  1874     rate above the requested limit for short durations.
       
  1875 
       
  1876     Args:
       
  1877       throttle_name: The name of the throttle to sleep on.  If None or
       
  1878         omitted, then sleep on all throttles.
       
  1879     """
       
  1880     if throttle_name is None:
       
  1881       for throttle_name in self.throttles:
       
  1882         self.Sleep(throttle_name=throttle_name)
       
  1883       return
       
  1884 
       
  1885     self.VerifyName(throttle_name)
       
  1886 
       
  1887     thread = threading.currentThread()
       
  1888 
       
  1889     while True:
       
  1890       duration = self.get_time() - self.last_rotate[throttle_name]
       
  1891 
       
  1892       total = 0
       
  1893       for count in self.prior_block[throttle_name].values():
       
  1894         total += count
       
  1895 
       
  1896       if total:
       
  1897         duration += self.ROTATE_PERIOD
       
  1898 
       
  1899       for count in self.transferred[throttle_name].values():
       
  1900         total += count
       
  1901 
       
  1902       sleep_time = (float(total) / self.throttles[throttle_name]) - duration
       
  1903 
       
  1904       if sleep_time < MINIMUM_THROTTLE_SLEEP_DURATION:
       
  1905         break
       
  1906 
       
  1907       logger.debug('[%s] Throttling on %s. Sleeping for %.1f ms '
       
  1908                    '(duration=%.1f ms, total=%d)',
       
  1909                    thread.getName(), throttle_name,
       
  1910                    sleep_time * 1000, duration * 1000, total)
       
  1911       self.thread_sleep(sleep_time)
       
  1912       if thread.exit_flag:
       
  1913         break
       
  1914       self._RotateCounts(throttle_name)
       
  1915 
       
  1916   def _RotateCounts(self, throttle_name):
       
  1917     """Rotate the transfer counters.
       
  1918 
       
  1919     If sufficient time has passed, then rotate the counters from active to
       
  1920     the prior-block of counts.
       
  1921 
       
  1922     This rotation is interlocked to ensure that multiple threads do not
       
  1923     over-rotate the counts.
       
  1924 
       
  1925     Args:
       
  1926       throttle_name: The name of the throttle to rotate.
       
  1927     """
       
  1928     self.VerifyName(throttle_name)
       
  1929     self.rotate_mutex[throttle_name].acquire()
       
  1930     try:
       
  1931       next_rotate_time = self.last_rotate[throttle_name] + self.ROTATE_PERIOD
       
  1932       if next_rotate_time >= self.get_time():
       
  1933         return
       
  1934 
       
  1935       for name, count in self.transferred[throttle_name].items():
       
  1936 
       
  1937 
       
  1938         self.prior_block[throttle_name][name] = count
       
  1939         self.transferred[throttle_name][name] = 0
       
  1940 
       
  1941         self.totals[throttle_name][name] += count
       
  1942 
       
  1943       self.last_rotate[throttle_name] = self.get_time()
       
  1944 
       
  1945     finally:
       
  1946       self.rotate_mutex[throttle_name].release()
       
  1947 
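  # Editor's note -- an illustrative reduction, not part of the changeset:
  # _RotateCounts above is a check / lock / re-check pattern. AddTransfer
  # performs a cheap unlocked test of last_rotate; whichever thread then wins
  # rotate_mutex re-checks the time under the lock, so concurrent callers
  # cannot rotate the counters twice for the same period:
  #
  #   if last_rotate + ROTATE_PERIOD < now():      # unlocked fast path
  #     with rotate_mutex:
  #       if last_rotate + ROTATE_PERIOD < now():  # re-check under the lock
  #         rotate_counters()
  #         last_rotate = now()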
       
  1948   def TotalTransferred(self, throttle_name):
       
  1949     """Return the total transferred, and over what period.
       
  1950 
       
  1951     Args:
       
  1952       throttle_name: The name of the throttle to total.
       
  1953 
       
  1954     Returns:
       
  1955       A tuple of the total count and running time for the given throttle name.
       
  1956     """
       
  1957     total = 0
       
  1958     for count in self.totals[throttle_name].values():
       
  1959       total += count
       
  1960     for count in self.transferred[throttle_name].values():
       
  1961       total += count
       
  1962     return total, self.get_time() - self.start_time
       
  1963 
       
  1964 
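# Editor's note -- an illustrative sketch, not part of the changeset. The
# removed Throttle.Sleep above enforces its limit with one piece of
# arithmetic: if `total` tokens were counted over `duration` seconds against
# a limit of `limit` tokens/second, a thread must wait another
# (total / limit) - duration seconds before transferring again. A minimal
# single-threaded version of that calculation, with no count rotation:

import time

class MiniThrottle(object):
  """Sleep-based rate limiter using the same math as Throttle.Sleep."""

  def __init__(self, limit_per_sec):
    self.limit = float(limit_per_sec)
    self.start = time.time()
    self.total = 0

  def AddTransfer(self, tokens):
    self.total += tokens

  def Sleep(self):
    # Time the current total *should* have taken at the limit, minus the
    # time it actually took; a positive difference means we are ahead.
    sleep_time = (self.total / self.limit) - (time.time() - self.start)
    if sleep_time > 0:
      time.sleep(sleep_time)

# Usage: throttle = MiniThrottle(250000)  # cf. the --bandwidth_limit default
#        throttle.AddTransfer(len(payload)); throttle.Sleep()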
       
  1965 class _ThreadBase(threading.Thread):
  1337 class _ThreadBase(threading.Thread):
  1966   """Provide some basic features for the threads used in the uploader.
  1338   """Provide some basic features for the threads used in the uploader.
  1967 
  1339 
  1968   This abstract base class is used to provide some common features:
  1340   This abstract base class is used to provide some common features:
  1969 
  1341 
  1991 
  1363 
  1992     self.setDaemon(True)
  1364     self.setDaemon(True)
  1993 
  1365 
  1994     self.exit_flag = False
  1366     self.exit_flag = False
  1995     self.error = None
  1367     self.error = None
       
  1368     self.traceback = None
  1996 
  1369 
  1997   def run(self):
  1370   def run(self):
  1998     """Perform the work of the thread."""
  1371     """Perform the work of the thread."""
  1999     logger.info('[%s] %s: started', self.getName(), self.__class__.__name__)
  1372     logger.debug('[%s] %s: started', self.getName(), self.__class__.__name__)
  2000 
  1373 
  2001     try:
  1374     try:
  2002       self.PerformWork()
  1375       self.PerformWork()
  2003     except:
  1376     except:
  2004       self.error = sys.exc_info()[1]
  1377       self.SetError()
  2005       logger.exception('[%s] %s:', self.getName(), self.__class__.__name__)
  1378       logger.exception('[%s] %s:', self.getName(), self.__class__.__name__)
  2006 
  1379 
  2007     logger.info('[%s] %s: exiting', self.getName(), self.__class__.__name__)
  1380     logger.debug('[%s] %s: exiting', self.getName(), self.__class__.__name__)
       
  1381 
       
  1382   def SetError(self):
       
  1383     """Sets the error and traceback information for this thread.
       
  1384 
       
  1385     This must be called from an exception handler.
       
  1386     """
       
  1387     if not self.error:
       
  1388       exc_info = sys.exc_info()
       
  1389       self.error = exc_info[1]
       
  1390       self.traceback = exc_info[2]
  2008 
  1391 
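  # Editor's note -- illustrative, not part of the changeset: SetError keeps
  # both halves of sys.exc_info() so that CheckError, which may run on the
  # main thread long after this worker died, can still log the full stack:
  #
  #   try:
  #     work()
  #   except:
  #     error, tb = sys.exc_info()[1:]          # exception value, traceback
  #   ...
  #   logger.debug(''.join(traceback.format_exception(
  #       error.__class__, error, tb)))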
  2009   def PerformWork(self):
  1392   def PerformWork(self):
  2010     """Perform the thread-specific work."""
  1393     """Perform the thread-specific work."""
  2011     raise NotImplementedError()
  1394     raise NotImplementedError()
  2012 
  1395 
  2013   def CheckError(self):
  1396   def CheckError(self):
  2014     """If an error is present, then log it."""
  1397     """If an error is present, then log it."""
  2015     if self.error:
  1398     if self.error:
  2016       logger.error('Error in %s: %s', self.GetFriendlyName(), self.error)
  1399       logger.error('Error in %s: %s', self.GetFriendlyName(), self.error)
       
  1400       if self.traceback:
       
  1401         logger.debug(''.join(traceback.format_exception(self.error.__class__,
       
  1402                                                         self.error,
       
  1403                                                         self.traceback)))
  2017 
  1404 
  2018   def GetFriendlyName(self):
  1405   def GetFriendlyName(self):
  2019     """Returns a human-friendly description of the thread."""
  1406     """Returns a human-friendly description of the thread."""
  2020     if hasattr(self, 'NAME'):
  1407     if hasattr(self, 'NAME'):
  2021       return self.NAME
  1408       return self.NAME
  2042   if not isinstance(error.reason[0], int):
  1429   if not isinstance(error.reason[0], int):
  2043     return True
  1430     return True
  2044   return error.reason[0] not in non_fatal_error_codes
  1431   return error.reason[0] not in non_fatal_error_codes
  2045 
  1432 
  2046 
  1433 
  2047 def PrettyKey(key):
       
  2048   """Returns a nice string representation of the given key."""
       
  2049   if key is None:
       
  2050     return None
       
  2051   elif isinstance(key, db.Key):
       
  2052     return repr(key.id_or_name())
       
  2053   return str(key)
       
  2054 
       
  2055 
       
  2056 class _BulkWorkerThread(_ThreadBase):
       
  2057   """A base class for worker threads.
       
  2058 
       
  2059   This thread will read WorkItem instances from the work_queue and upload
       
  2060   the entities to the server application. Progress information will be
       
  2061   pushed into the progress_queue as the work is being performed.
       
  2062 
       
  2063   If a _BulkWorkerThread encounters a transient error, the entities will be
       
  2064   resent; if a fatal error is encountered, the _BulkWorkerThread exits.
       
  2065 
       
  2066   Subclasses must provide implementations for PreProcessItem, TransferItem,
       
  2067   and ProcessResponse.
       
  2068   """
       
  2069 
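  # Editor's note -- an illustrative reduction of the retry contract in the
  # docstring above, not part of the changeset. Transient failures are
  # logged and swallowed, which leaves `transferred` False, so the finally
  # clause re-queues the item; anything else propagates and ends the thread.
  # TRANSIENT_ERRORS stands in for the datastore, HTTP 403/5xx, and
  # non-fatal URL errors handled in PerformWork below:
  #
  #   transferred = False
  #   try:
  #     response = self.TransferItem(item)
  #     transferred = True
  #   except TRANSIENT_ERRORS:
  #     logger.exception(...)             # swallowed; the item is retried
  #   finally:
  #     if transferred:
  #       item.MarkAsTransferred()
  #     else:
  #       item.MarkAsError()
  #       self.work_queue.reput(item)     # resend later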
       
  2070   def __init__(self,
       
  2071                work_queue,
       
  2072                throttle,
       
  2073                thread_gate,
       
  2074                request_manager,
       
  2075                num_threads,
       
  2076                batch_size,
       
  2077                state_message,
       
  2078                get_time):
       
  2079     """Initialize the BulkLoaderThread instance.
       
  2080 
       
  2081     Args:
       
  2082       work_queue: A queue containing WorkItems for processing.
       
  2083       throttle: A Throttle to control upload bandwidth.
       
  2084       thread_gate: A ThreadGate to control number of simultaneous uploads.
       
  2085       request_manager: A RequestManager instance.
       
  2086       num_threads: The number of threads for parallel transfers.
       
  2087       batch_size: The number of entities to transfer per request.
       
  2088       state_message: Used for dependency injection.
       
  2089       get_time: Used for dependency injection.
       
  2090     """
       
  2091     _ThreadBase.__init__(self)
       
  2092 
       
  2093     self.work_queue = work_queue
       
  2094     self.throttle = throttle
       
  2095     self.thread_gate = thread_gate
       
  2096     self.request_manager = request_manager
       
  2097     self.num_threads = num_threads
       
  2098     self.batch_size = batch_size
       
  2099     self.state_message = state_message
       
  2100     self.get_time = get_time
       
  2101 
       
  2102   def PreProcessItem(self, item):
       
  2103     """Performs pre transfer processing on a work item."""
       
  2104     raise NotImplementedError()
       
  2105 
       
  2106   def TransferItem(self, item):
       
  2107     """Transfers the entities associated with an item.
       
  2108 
       
  2109     Args:
       
  2110       item: An item of upload (WorkItem) or download (KeyRange) work.
       
  2111 
       
  2112     Returns:
       
  2113       A tuple of (estimated transfer size, response)
       
  2114     """
       
  2115     raise NotImplementedError()
       
  2116 
       
  2117   def ProcessResponse(self, item, result):
       
  2118     """Processes the response from the server application."""
       
  2119     raise NotImplementedError()
       
  2120 
       
  2121   def PerformWork(self):
       
  2122     """Perform the work of a _BulkWorkerThread."""
       
  2123     while not self.exit_flag:
       
  2124       transferred = False
       
  2125       self.thread_gate.StartWork()
       
  2126       try:
       
  2127         try:
       
  2128           item = self.work_queue.get(block=True, timeout=1.0)
       
  2129         except Queue.Empty:
       
  2130           continue
       
  2131         if item == _THREAD_SHOULD_EXIT:
       
  2132           break
       
  2133 
       
  2134         logger.debug('[%s] Got work item %s', self.getName(), item)
       
  2135 
       
  2136         try:
       
  2137 
       
  2138           item.MarkAsTransferring()
       
  2139           self.PreProcessItem(item)
       
  2140           response = None
       
  2141           try:
       
  2142             try:
       
  2143               t = self.get_time()
       
  2144               response = self.TransferItem(item)
       
  2145               status = 200
       
  2146               transferred = True
       
  2147               transfer_time = self.get_time() - t
       
  2148               logger.debug('[%s] %s Transferred %d entities in %0.1f seconds',
       
  2149                            self.getName(), item, item.count, transfer_time)
       
  2150               self.throttle.AddTransfer(RECORDS, item.count)
       
  2151             except (db.InternalError, db.NotSavedError, db.Timeout,
       
  2152                     apiproxy_errors.OverQuotaError,
       
  2153                     apiproxy_errors.DeadlineExceededError), e:
       
  2154               logger.exception('Caught non-fatal datastore error: %s', e)
       
  2155             except urllib2.HTTPError, e:
       
  2156               status = e.code
       
  2157               if status == 403 or (status >= 500 and status < 600):
       
  2158                 logger.exception('Caught non-fatal HTTP error: %d %s',
       
  2159                                  status, e.msg)
       
  2160               else:
       
  2161                 raise e
       
  2162             except urllib2.URLError, e:
       
  2163               if IsURLErrorFatal(e):
       
  2164                 raise e
       
  2165               else:
       
  2166                 logger.exception('Caught non-fatal URL error: %s', e.reason)
       
  2167 
       
  2168             self.ProcessResponse(item, response)
       
  2169 
       
  2170           except:
       
  2171             self.error = sys.exc_info()[1]
       
  2172             logger.exception('[%s] %s: caught exception %s', self.getName(),
       
  2173                              self.__class__.__name__, str(sys.exc_info()))
       
  2174             raise
       
  2175 
       
  2176         finally:
       
  2177           if transferred:
       
  2178             item.MarkAsTransferred()
       
  2179             self.work_queue.task_done()
       
  2180             self.thread_gate.TransferSuccess(transfer_time)
       
  2181           else:
       
  2182             item.MarkAsError()
       
  2183             try:
       
  2184               self.work_queue.reput(item, block=False)
       
  2185             except Queue.Full:
       
  2186               logger.error('[%s] Failed to reput work item.', self.getName())
       
  2187               raise Error('Failed to reput work item')
       
  2188             self.thread_gate.DecreaseWorkers()
       
  2189           logger.info('%s %s',
       
  2190                       item,
       
  2191                       self.state_message(item.state))
       
  2192 
       
  2193       finally:
       
  2194         self.thread_gate.FinishWork()
       
  2195 
       
  2196 
       
  2197   def GetFriendlyName(self):
       
  2198     """Returns a human-friendly name for this thread."""
       
  2199     return 'worker [%s]' % self.getName()
       
  2200 
       
  2201 
       
  2202 class BulkLoaderThread(_BulkWorkerThread):
       
  2203   """A thread which transmits entities to the server application.
       
  2204 
       
  2205   This thread will read WorkItem instances from the work_queue and upload
       
  2206   the entities to the server application. Progress information will be
       
  2207   pushed into the progress_queue as the work is being performed.
       
  2208 
       
  2209   If a BulkLoaderThread encounters a transient error, the entities will be
       
  2210   resent; if a fatal error is encountered, the BulkLoaderThread exits.
       
  2211   """
       
  2212 
       
  2213   def __init__(self,
       
  2214                work_queue,
       
  2215                throttle,
       
  2216                thread_gate,
       
  2217                request_manager,
       
  2218                num_threads,
       
  2219                batch_size,
       
  2220                get_time=time.time):
       
  2221     """Initialize the BulkLoaderThread instance.
       
  2222 
       
  2223     Args:
       
  2224       work_queue: A queue containing WorkItems for processing.
       
  2225       throttle: A Throttle to control upload bandwidth.
       
  2226       thread_gate: A ThreadGate to control number of simultaneous uploads.
       
  2227       request_manager: A RequestManager instance.
       
  2228       num_threads: The number of threads for parallel transfers.
       
  2229       batch_size: The number of entities to transfer per request.
       
  2230       get_time: Used for dependency injection.
       
  2231     """
       
  2232     _BulkWorkerThread.__init__(self,
       
  2233                                work_queue,
       
  2234                                throttle,
       
  2235                                thread_gate,
       
  2236                                request_manager,
       
  2237                                num_threads,
       
  2238                                batch_size,
       
  2239                                ImportStateMessage,
       
  2240                                get_time)
       
  2241 
       
  2242   def PreProcessItem(self, item):
       
  2243     """Performs pre transfer processing on a work item."""
       
  2244     if item and not item.content:
       
  2245       item.content = self.request_manager.EncodeContent(item.rows)
       
  2246 
       
  2247   def TransferItem(self, item):
       
  2248     """Transfers the entities associated with an item.
       
  2249 
       
  2250     Args:
       
  2251       item: An item of upload (WorkItem) work.
       
  2252 
       
  2253     Returns:
       
  2254       A tuple of (estimated transfer size, response)
       
  2255     """
       
  2256     return self.request_manager.PostEntities(item)
       
  2257 
       
  2258   def ProcessResponse(self, item, response):
       
  2259     """Processes the response from the server application."""
       
  2260     pass
       
  2261 
       
  2262 
       
  2263 class BulkExporterThread(_BulkWorkerThread):
       
  2264   """A thread which receives entities from the server application.
       
  2265 
       
  2266   This thread will read KeyRange instances from the work_queue and export
       
  2267   the entities from the server application. Progress information will be
       
  2268   pushed into the progress_queue as the work is being performed.
       
  2269 
       
  2270   If a BulkExporterThread encounters an error when trying to post data,
       
  2271   the thread will exit and cause the application to terminate.
       
  2272   """
       
  2273 
       
  2274   def __init__(self,
       
  2275                work_queue,
       
  2276                throttle,
       
  2277                thread_gate,
       
  2278                request_manager,
       
  2279                num_threads,
       
  2280                batch_size,
       
  2281                get_time=time.time):
       
  2282 
       
  2283     """Initialize the BulkExporterThread instance.
       
  2284 
       
  2285     Args:
       
  2286       work_queue: A queue containing KeyRanges for processing.
       
  2287       throttle: A Throttle to control download bandwidth.
       
  2288       thread_gate: A ThreadGate to control number of simultaneous uploads.
       
  2289       request_manager: A RequestManager instance.
       
  2290       num_threads: The number of threads for parallel transfers.
       
  2291       batch_size: The number of entities to transfer per request.
       
  2292       get_time: Used for dependency injection.
       
  2293     """
       
  2294     _BulkWorkerThread.__init__(self,
       
  2295                                work_queue,
       
  2296                                throttle,
       
  2297                                thread_gate,
       
  2298                                request_manager,
       
  2299                                num_threads,
       
  2300                                batch_size,
       
  2301                                ExportStateMessage,
       
  2302                                get_time)
       
  2303 
       
  2304   def PreProcessItem(self, unused_item):
       
  2305     """Performs pre transfer processing on a work item."""
       
  2306     pass
       
  2307 
       
  2308   def TransferItem(self, item):
       
  2309     """Transfers the entities associated with an item.
       
  2310 
       
  2311     Args:
       
  2312       item: An item of download (KeyRange) work.
       
  2313 
       
  2314     Returns:
       
  2315       A tuple of (estimated transfer size, response)
       
  2316     """
       
  2317     return self.request_manager.GetEntities(item)
       
  2318 
       
  2319   def ProcessResponse(self, item, export_result):
       
  2320     """Processes the response from the server application."""
       
  2321     if export_result:
       
  2322       item.Process(export_result, self.num_threads, self.batch_size,
       
  2323                    self.work_queue)
       
  2324     item.state = STATE_GOT
       
  2325 
       
  2326 
       
  2327 class DataSourceThread(_ThreadBase):
  1434 class DataSourceThread(_ThreadBase):
  2328   """A thread which reads WorkItems and pushes them into a queue.
  1435   """A thread which reads WorkItems and pushes them into a queue.
  2329 
  1436 
  2330   This thread will read/consume WorkItems from a generator (produced by
  1437   This thread will read/consume WorkItems from a generator (produced by
  2331   the generator factory). These WorkItems will then be pushed into the
  1438   the generator factory). These WorkItems will then be pushed into the
  2332   work_queue. Note that reading will block if/when the work_queue becomes
  1439   thread_pool. Note that reading will block if/when the thread_pool becomes
  2333   full. Information on content consumed from the generator will be pushed
  1440   full. Information on content consumed from the generator will be pushed
  2334   into the progress_queue.
  1441   into the progress_queue.
  2335   """
  1442   """
  2336 
  1443 
  2337   NAME = 'data source thread'
  1444   NAME = 'data source thread'
  2338 
  1445 
  2339   def __init__(self,
  1446   def __init__(self,
  2340                work_queue,
  1447                request_manager,
       
  1448                thread_pool,
  2341                progress_queue,
  1449                progress_queue,
  2342                workitem_generator_factory,
  1450                workitem_generator_factory,
  2343                progress_generator_factory):
  1451                progress_generator_factory):
  2344     """Initialize the DataSourceThread instance.
  1452     """Initialize the DataSourceThread instance.
  2345 
  1453 
  2346     Args:
  1454     Args:
  2347       work_queue: A queue containing WorkItems for processing.
  1455       request_manager: A RequestManager instance.
       
  1456       thread_pool: An AdaptiveThreadPool instance.
  2348       progress_queue: A queue used for tracking progress information.
  1457       progress_queue: A queue used for tracking progress information.
  2349       workitem_generator_factory: A factory that creates a WorkItem generator
  1458       workitem_generator_factory: A factory that creates a WorkItem generator
  2350       progress_generator_factory: A factory that creates a generator which
  1459       progress_generator_factory: A factory that creates a generator which
  2351         produces prior progress status, or None if there is no prior status
  1460         produces prior progress status, or None if there is no prior status
  2352         to use.
  1461         to use.
  2353     """
  1462     """
  2354     _ThreadBase.__init__(self)
  1463     _ThreadBase.__init__(self)
  2355 
  1464 
  2356     self.work_queue = work_queue
  1465     self.request_manager = request_manager
       
  1466     self.thread_pool = thread_pool
  2357     self.progress_queue = progress_queue
  1467     self.progress_queue = progress_queue
  2358     self.workitem_generator_factory = workitem_generator_factory
  1468     self.workitem_generator_factory = workitem_generator_factory
  2359     self.progress_generator_factory = progress_generator_factory
  1469     self.progress_generator_factory = progress_generator_factory
  2360     self.entity_count = 0
  1470     self.entity_count = 0
  2361 
  1471 
  2364     if self.progress_generator_factory:
  1474     if self.progress_generator_factory:
  2365       progress_gen = self.progress_generator_factory()
  1475       progress_gen = self.progress_generator_factory()
  2366     else:
  1476     else:
  2367       progress_gen = None
  1477       progress_gen = None
  2368 
  1478 
  2369     content_gen = self.workitem_generator_factory(self.progress_queue,
  1479     content_gen = self.workitem_generator_factory(self.request_manager,
       
  1480                                                   self.progress_queue,
  2370                                                   progress_gen)
  1481                                                   progress_gen)
  2371 
  1482 
  2372     self.xfer_count = 0
  1483     self.xfer_count = 0
  2373     self.read_count = 0
  1484     self.read_count = 0
  2374     self.read_all = False
  1485     self.read_all = False
  2376     for item in content_gen.Batches():
  1487     for item in content_gen.Batches():
  2377       item.MarkAsRead()
  1488       item.MarkAsRead()
  2378 
  1489 
  2379       while not self.exit_flag:
  1490       while not self.exit_flag:
  2380         try:
  1491         try:
  2381           self.work_queue.put(item, block=True, timeout=1.0)
  1492           self.thread_pool.SubmitItem(item, block=True, timeout=1.0)
  2382           self.entity_count += item.count
  1493           self.entity_count += item.count
  2383           break
  1494           break
  2384         except Queue.Full:
  1495         except Queue.Full:
  2385           pass
  1496           pass
  2386 
  1497 
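      # Editor's note -- illustrative, not part of the changeset: the loop
      # above is a blocking put with a shutdown check. Blocking for at most a
      # second per attempt lets exit_flag be re-examined between attempts
      # instead of hanging forever on a full pool:
      #
      #   while not self.exit_flag:
      #     try:
      #       self.thread_pool.SubmitItem(item, block=True, timeout=1.0)
      #       break                # submitted; move on to the next item
      #     except Queue.Full:
      #       pass                 # still full; loop and re-check the flag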
  2524 
  1635 
  2525     self.insert_cursor = self.secondary_conn.cursor()
  1636     self.insert_cursor = self.secondary_conn.cursor()
  2526     self.update_cursor = self.secondary_conn.cursor()
  1637     self.update_cursor = self.secondary_conn.cursor()
  2527 
  1638 
  2528 
  1639 
       
  1640 zero_matcher = re.compile(r'\x00')
       
  1641 
       
  1642 zero_one_matcher = re.compile(r'\x00\x01')
       
  1643 
       
  1644 
       
  1645 def KeyStr(key):
       
  1646   """Returns a string to represent a key, preserving ordering.
       
  1647 
       
  1648   Unlike datastore.Key.__str__(), we have the property:
       
  1649 
       
  1650     key1 < key2 ==> KeyStr(key1) < KeyStr(key2)
       
  1651 
       
  1652   The key string is constructed from the key path as follows:
       
  1653     (1) Strings are prepended with ':' and numeric ids are padded to
       
  1654         20 digits.
       
  1655     (2) Any null characters (u'\0') present are replaced with u'\0\1'.
       
  1656     (3) The sequence u'\0\0' is used to separate each component of the path.
       
  1657 
       
  1658   (1) ensures that names and ids compare properly, while (2) and (3) enforce
       
  1659   the part-by-part comparison of pieces of the path.
       
  1660 
       
  1661   Args:
       
  1662     key: A datastore.Key instance.
       
  1663 
       
  1664   Returns:
       
  1665     A string representation of the key, which preserves ordering.
       
  1666   """
       
  1667   assert isinstance(key, datastore.Key)
       
  1668   path = key.to_path()
       
  1669 
       
  1670   out_path = []
       
  1671   for part in path:
       
  1672     if isinstance(part, (int, long)):
       
  1673       part = '%020d' % part
       
  1674     else:
       
  1675       part = ':%s' % part
       
  1676 
       
  1677     out_path.append(zero_matcher.sub(u'\0\1', part))
       
  1678 
       
  1679   out_str = u'\0\0'.join(out_path)
       
  1680 
       
  1681   return out_str
       
  1682 
       
  1683 
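# Editor's illustrative check, not part of the changeset: the padding and
# ':' prefix rules are what make KeyStr order-preserving -- without padding,
# u'2' > u'10' as strings. The helper below restates rule (1) in isolation
# (StrKey below is the exact inverse, so StrKey(KeyStr(k)) == k):

def _pad_part(part):
  # Same rule as KeyStr step (1): pad numeric ids, prefix names with ':'.
  if isinstance(part, (int, long)):
    return '%020d' % part
  return ':%s' % part

assert _pad_part(2) < _pad_part(10)        # numeric order preserved
assert _pad_part(99999) < _pad_part('a')   # all ids sort before all names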
       
  1684 def StrKey(key_str):
       
  1685   """The inverse of the KeyStr function.
       
  1686 
       
  1687   Args:
       
  1688     key_str: A string in the range of KeyStr.
       
  1689 
       
  1690   Returns:
       
  1691     A datastore.Key instance k, such that KeyStr(k) == key_str.
       
  1692   """
       
  1693   parts = key_str.split(u'\0\0')
       
  1694   for i in xrange(len(parts)):
       
  1695     if parts[i][0] == ':':
       
  1696       part = parts[i][1:]
       
  1697       part = zero_one_matcher.sub(u'\0', part)
       
  1698       parts[i] = part
       
  1699     else:
       
  1700       parts[i] = int(parts[i])
       
  1701   return datastore.Key.from_path(*parts)
       
  1702 
       
  1703 
  2529 class ResultDatabase(_Database):
  1704 class ResultDatabase(_Database):
  2530   """Persistently record all the entities downloaded during an export.
  1705   """Persistently record all the entities downloaded during an export.
  2531 
  1706 
  2532   The entities are held in the database by their unique datastore key
  1707   The entities are held in the database by their unique datastore key
  2533   in order to avoid duplication if an export is restarted.
  1708   in order to avoid duplication if an export is restarted.
  2542         used to make sure we are not using an old database.
  1717         used to make sure we are not using an old database.
  2543       commit_periodicity: How many operations to perform between commits.
  1718       commit_periodicity: How many operations to perform between commits.
  2544     """
  1719     """
  2545     self.complete = False
  1720     self.complete = False
  2546     create_table = ('create table result (\n'
  1721     create_table = ('create table result (\n'
  2547                     'id TEXT primary key,\n'
  1722                     'id BLOB primary key,\n'
  2548                     'value BLOB not null)')
  1723                     'value BLOB not null)')
  2549 
  1724 
  2550     _Database.__init__(self,
  1725     _Database.__init__(self,
  2551                        db_filename,
  1726                        db_filename,
  2552                        create_table,
  1727                        create_table,
  2558       self.existing_count = int(cursor.fetchone()[0])
  1733       self.existing_count = int(cursor.fetchone()[0])
  2559     else:
  1734     else:
  2560       self.existing_count = 0
  1735       self.existing_count = 0
  2561     self.count = self.existing_count
  1736     self.count = self.existing_count
  2562 
  1737 
  2563   def _StoreEntity(self, entity_id, value):
  1738   def _StoreEntity(self, entity_id, entity):
  2564     """Store an entity in the result database.
  1739     """Store an entity in the result database.
  2565 
  1740 
  2566     Args:
  1741     Args:
  2567       entity_id: A db.Key for the entity.
  1742       entity_id: A datastore.Key for the entity.
  2568       value: A string of the contents of the entity.
  1743       entity: The entity to store.
  2569 
  1744 
  2570     Returns:
  1745     Returns:
  2571       True if this entity was not already present in the result database.
  1746       True if this entity was not already present in the result database.
  2572     """
  1747     """
  2573 
  1748 
  2574     assert _RunningInThread(self.secondary_thread)
  1749     assert _RunningInThread(self.secondary_thread)
  2575     assert isinstance(entity_id, db.Key)
  1750     assert isinstance(entity_id, datastore.Key), (
  2576 
  1751         'expected a datastore.Key, got a %s' % entity_id.__class__.__name__)
  2577     entity_id = entity_id.id_or_name()
  1752 
       
  1753     key_str = buffer(KeyStr(entity_id).encode('utf-8'))
  2578     self.insert_cursor.execute(
  1754     self.insert_cursor.execute(
  2579         'select count(*) from result where id = ?', (unicode(entity_id),))
  1755         'select count(*) from result where id = ?', (key_str,))
       
  1756 
  2580     already_present = self.insert_cursor.fetchone()[0]
  1757     already_present = self.insert_cursor.fetchone()[0]
  2581     result = True
  1758     result = True
  2582     if already_present:
  1759     if already_present:
  2583       result = False
  1760       result = False
  2584       self.insert_cursor.execute('delete from result where id = ?',
  1761       self.insert_cursor.execute('delete from result where id = ?',
  2585                                  (unicode(entity_id),))
  1762                                  (key_str,))
  2586     else:
  1763     else:
  2587       self.count += 1
  1764       self.count += 1
       
  1765     value = entity.Encode()
  2588     self.insert_cursor.execute(
  1766     self.insert_cursor.execute(
  2589         'insert into result (id, value) values (?, ?)',
  1767         'insert into result (id, value) values (?, ?)',
  2590         (unicode(entity_id), buffer(value)))
  1768         (key_str, buffer(value)))
  2591     return result
  1769     return result
  2592 
  1770 
  2593   def StoreEntities(self, keys, entities):
  1771   def StoreEntities(self, keys, entities):
  2594     """Store a group of entities in the result database.
  1772     """Store a group of entities in the result database.
  2595 
  1773 
  2601       The number of new entities stored in the result database.
  1779       The number of new entities stored in the result database.
  2602     """
  1780     """
  2603     self._OpenSecondaryConnection()
  1781     self._OpenSecondaryConnection()
  2604     t = time.time()
  1782     t = time.time()
  2605     count = 0
  1783     count = 0
  2606     for entity_id, value in zip(keys,
  1784     for entity_id, entity in zip(keys,
  2607                                 entities):
  1785                                  entities):
  2608       if self._StoreEntity(entity_id, value):
  1786       if self._StoreEntity(entity_id, entity):
  2609         count += 1
  1787         count += 1
  2610     logger.debug('%s insert: delta=%.3f',
  1788     logger.debug('%s insert: delta=%.3f',
  2611                  self.db_filename,
  1789                  self.db_filename,
  2612                  time.time() - t)
  1790                  time.time() - t)
  2613     logger.debug('Entities transferred total: %s', self.count)
  1791     logger.debug('Entities transferred total: %s', self.count)
  2625 
  1803 
  2626     cursor.execute(
  1804     cursor.execute(
  2627         'select id, value from result order by id')
  1805         'select id, value from result order by id')
  2628 
  1806 
  2629     for unused_entity_id, entity in cursor:
  1807     for unused_entity_id, entity in cursor:
  2630       yield cPickle.loads(str(entity))
  1808       entity_proto = entity_pb.EntityProto(contents=entity)
       
  1809       yield datastore.Entity._FromPb(entity_proto)
  2631 
  1810 
  2632 
  1811 
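# Editor's illustrative sketch, not part of the changeset: _StoreEntity
# above is a delete-then-insert "upsert" keyed on the order-preserving key
# string, which is what makes a restarted export idempotent. The same shape
# against a bare sqlite3 connection, outside the _Database machinery:

import sqlite3

_conn = sqlite3.connect(':memory:')
_conn.execute('create table result (id BLOB primary key, value BLOB not null)')

def _store(key_str, value):
  """Returns True if key_str was not already present."""
  cursor = _conn.cursor()
  cursor.execute('select count(*) from result where id = ?', (key_str,))
  fresh = cursor.fetchone()[0] == 0
  if not fresh:
    cursor.execute('delete from result where id = ?', (key_str,))
  cursor.execute('insert into result (id, value) values (?, ?)',
                 (key_str, buffer(value)))
  return fresh

assert _store(u'k1', 'v1')       # new entity: counted
assert not _store(u'k1', 'v2')   # re-downloaded entity: replaced, not counted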
  2633 class _ProgressDatabase(_Database):
  1812 class _ProgressDatabase(_Database):
  2634   """Persistently record all progress information during an upload.
  1813   """Persistently record all progress information during an upload.
  2635 
  1814 
  2721       A string to later be used as a unique key to update this state.
  1900       A string to later be used as a unique key to update this state.
  2722     """
  1901     """
  2723     self._OpenSecondaryConnection()
  1902     self._OpenSecondaryConnection()
  2724 
  1903 
  2725     assert _RunningInThread(self.secondary_thread)
  1904     assert _RunningInThread(self.secondary_thread)
  2726     assert not key_start or isinstance(key_start, self.py_type)
  1905     assert (not key_start) or isinstance(key_start, self.py_type), (
  2727     assert not key_end or isinstance(key_end, self.py_type), '%s is a %s' % (
  1906         '%s is a %s, %s expected %s' % (key_start,
  2728         key_end, key_end.__class__)
  1907                                         key_start.__class__,
       
  1908                                         self.__class__.__name__,
       
  1909                                         self.py_type))
       
  1910     assert (not key_end) or isinstance(key_end, self.py_type), (
       
  1911         '%s is a %s, %s expected %s' % (key_end,
       
  1912                                         key_end.__class__,
       
  1913                                         self.__class__.__name__,
       
  1914                                         self.py_type))
  2729     assert KeyLEQ(key_start, key_end), '%s not less than %s' % (
  1915     assert KeyLEQ(key_start, key_end), '%s not less than %s' % (
  2730         repr(key_start), repr(key_end))
  1916         repr(key_start), repr(key_end))
  2731 
  1917 
  2732     self.insert_cursor.execute(
  1918     self.insert_cursor.execute(
  2733         'insert into progress (state, key_start, key_end) values (?, ?, ?)',
  1919         'insert into progress (state, key_start, key_end) values (?, ?, ?)',
  2841   def __init__(self, db_filename, signature):
  2027   def __init__(self, db_filename, signature):
  2842     """Initialize an ExportProgressDatabase."""
  2028     """Initialize an ExportProgressDatabase."""
  2843     _ProgressDatabase.__init__(self,
  2029     _ProgressDatabase.__init__(self,
  2844                                db_filename,
  2030                                db_filename,
  2845                                'TEXT',
  2031                                'TEXT',
  2846                                db.Key,
  2032                                datastore.Key,
  2847                                signature,
  2033                                signature,
  2848                                commit_periodicity=1)
  2034                                commit_periodicity=1)
  2849 
  2035 
  2850   def UseProgressData(self):
  2036   def UseProgressData(self):
  2851     """Check if the progress database contains progress data.
  2037     """Check if the progress database contains progress data.
  3009     """Write the contents of the result database."""
  2195     """Write the contents of the result database."""
  3010     exporter = Exporter.RegisteredExporter(self.kind)
  2196     exporter = Exporter.RegisteredExporter(self.kind)
  3011     exporter.output_entities(self.result_db.AllEntities())
  2197     exporter.output_entities(self.result_db.AllEntities())
  3012 
  2198 
  3013   def UpdateProgress(self, item):
  2199   def UpdateProgress(self, item):
  3014     """Update the state of the given KeyRange.
  2200     """Update the state of the given KeyRangeItem.
  3015 
  2201 
  3016     Args:
  2202     Args:
  3017       item: A KeyRange instance.
  2203       item: A KeyRangeItem instance.
  3018     """
  2204     """
  3019     if item.state == STATE_GOT:
  2205     if item.state == STATE_GOT:
  3020       count = self.result_db.StoreEntities(item.export_result.keys,
  2206       count = self.result_db.StoreEntities(item.download_result.keys,
  3021                                            item.export_result.entities)
  2207                                            item.download_result.entities)
  3022       self.db.DeleteKey(item.progress_key)
  2208       self.db.DeleteKey(item.progress_key)
  3023       self.entities_transferred += count
  2209       self.entities_transferred += count
  3024     else:
  2210     else:
  3025       self.db.UpdateState(item.progress_key, item.state)
  2211       self.db.UpdateState(item.progress_key, item.state)
  3026 
  2212 
  3027 
  2213 
       
  2214 class MapperProgressThread(_ProgressThreadBase):
       
  2215   """A thread to record progress information for maps over the datastore."""
       
  2216 
       
  2217   def __init__(self, kind, progress_queue, progress_db):
       
  2218     """Initialize the MapperProgressThread instance.
       
  2219 
       
  2220     Args:
       
  2221       kind: The kind of entities being stored in the database.
       
  2222       progress_queue: A Queue used for tracking progress information.
       
  2223       progress_db: The database for tracking progress information; should
       
  2224         be an instance of ProgressDatabase.
       
  2225     """
       
  2226     _ProgressThreadBase.__init__(self, progress_queue, progress_db)
       
  2227 
       
  2228     self.kind = kind
       
  2229     self.mapper = Mapper.RegisteredMapper(self.kind)
       
  2230 
       
  2231   def EntitiesTransferred(self):
       
  2232     """Return the total number of unique entities transferred."""
       
  2233     return self.entities_transferred
       
  2234 
       
  2235   def WorkFinished(self):
       
  2236     """Perform actions after map is complete."""
       
  2237     pass
       
  2238 
       
  2239   def UpdateProgress(self, item):
       
  2240     """Update the state of the given KeyRangeItem.
       
  2241 
       
  2242     Args:
       
  2243       item: A KeyRangeItem instance.
       
  2244     """
       
  2245     if item.state == STATE_GOT:
       
  2246       self.entities_transferred += item.count
       
  2247       self.db.DeleteKey(item.progress_key)
       
  2248     else:
       
  2249       self.db.UpdateState(item.progress_key, item.state)
       
  2250 
       
  2251 
  3028 def ParseKey(key_string):
  2252 def ParseKey(key_string):
  3029   """Turn a key stored in the database into a db.Key or None.
  2253   """Turn a key stored in the database into a Key or None.
  3030 
  2254 
  3031   Args:
  2255   Args:
  3032     key_string: The string representation of a db.Key.
  2256     key_string: The string representation of a Key.
  3033 
  2257 
  3034   Returns:
  2258   Returns:
  3035     A db.Key instance or None
  2259     A datastore.Key instance or None
  3036   """
  2260   """
  3037   if not key_string:
  2261   if not key_string:
  3038     return None
  2262     return None
  3039   if key_string == 'None':
  2263   if key_string == 'None':
  3040     return None
  2264     return None
  3041   return db.Key(encoded=key_string)
  2265   return datastore.Key(encoded=key_string)
  3042 
  2266 
  3043 
  2267 
  3044 def Validate(value, typ):
  2268 def Validate(value, typ):
  3045   """Checks that value is non-empty and of the right type.
  2269   """Checks that value is non-empty and of the right type.
  3046 
  2270 
  3095   __properties = None
  2319   __properties = None
  3096 
  2320 
  3097   def __init__(self, kind, properties):
  2321   def __init__(self, kind, properties):
  3098     """Constructor.
  2322     """Constructor.
  3099 
  2323 
  3100     Populates this Loader's kind and properties map. Also registers it with
  2324     Populates this Loader's kind and properties map.
  3101     the bulk loader, so that all you need to do is instantiate your Loader,
       
  3102     and the bulkload handler will automatically use it.
       
  3103 
  2325 
  3104     Args:
  2326     Args:
  3105       kind: a string containing the entity kind that this loader handles
  2327       kind: a string containing the entity kind that this loader handles
  3106 
  2328 
  3107       properties: list of (name, converter) tuples.
  2329       properties: list of (name, converter) tuples.
  3137 
  2359 
  3138     self.__properties = properties
  2360     self.__properties = properties
  3139 
  2361 
  3140   @staticmethod
  2362   @staticmethod
  3141   def RegisterLoader(loader):
  2363   def RegisterLoader(loader):
  3142 
  2364     """Register 'loader' as the Loader instance for its kind.
       
  2365 
       
  2366     Args:
       
  2367       loader: A Loader instance.
       
  2368     """
  3143     Loader.__loaders[loader.kind] = loader
  2369     Loader.__loaders[loader.kind] = loader
  3144 
  2370 
  3145   def alias_old_names(self):
  2371   def alias_old_names(self):
  3146     """Aliases method names so that Loaders defined with old names work."""
  2372     """Aliases method names so that Loaders defined with old names work."""
  3147     aliases = (
  2373     aliases = (
  3164     """Creates an entity from a list of property values.
  2390     """Creates an entity from a list of property values.
  3165 
  2391 
  3166     Args:
  2392     Args:
  3167       values: list/tuple of str
  2393       values: list/tuple of str
  3168       key_name: if provided, the name for the (single) resulting entity
  2394       key_name: if provided, the name for the (single) resulting entity
  3169       parent: A db.Key instance for the parent, or None
  2395       parent: A datastore.Key instance for the parent, or None
  3170 
  2396 
  3171     Returns:
  2397     Returns:
  3172       list of db.Model
  2398       list of db.Model
  3173 
  2399 
  3174       The returned entities are populated with the property values from the
  2400       The returned entities are populated with the property values from the
  3220     This method can be overridden to control the key generation for
  2446     This method can be overridden to control the key generation for
  3221     uploaded entities. The value returned should be None (to use a
  2447     uploaded entities. The value returned should be None (to use a
  3222     server generated numeric key), or a string which neither starts
  2448     server generated numeric key), or a string which neither starts
  3223     with a digit nor has the form __*__ (see
  2449     with a digit nor has the form __*__ (see
  3224     http://code.google.com/appengine/docs/python/datastore/keysandentitygroups.html),
  2450     http://code.google.com/appengine/docs/python/datastore/keysandentitygroups.html),
  3225     or a db.Key instance.
  2451     or a datastore.Key instance.
  3226 
  2452 
  3227     If you generate your own string keys, keep in mind:
  2453     If you generate your own string keys, keep in mind:
  3228 
  2454 
  3229     1. The key name for each entity must be unique.
  2455     1. The key name for each entity must be unique.
  3230     2. If an entity of the same kind and key already exists in the
  2456     2. If an entity of the same kind and key already exists in the
  3303   def RegisteredLoader(kind):
  2529   def RegisteredLoader(kind):
  3304     """Returns the loader instance for the given kind if it exists."""
  2530     """Returns the loader instance for the given kind if it exists."""
  3305     return Loader.__loaders[kind]
  2531     return Loader.__loaders[kind]
  3306 
  2532 
  3307 
  2533 
       
  2534 class RestoreThread(_ThreadBase):
       
  2535   """A thread to read saved entity_pbs from sqlite3."""
       
  2536   NAME = 'RestoreThread'
       
  2537   _ENTITIES_DONE = 'Entities Done'
       
  2538 
       
  2539   def __init__(self, queue, filename):
       
  2540     _ThreadBase.__init__(self)
       
  2541     self.queue = queue
       
  2542     self.filename = filename
       
  2543 
       
  2544   def PerformWork(self):
       
  2545     db_conn = sqlite3.connect(self.filename)
       
  2546     cursor = db_conn.cursor()
       
  2547     cursor.execute('select id, value from result')
       
  2548     for entity_id, value in cursor:
       
  2549       self.queue.put([entity_id, value], block=True)
       
  2550     self.queue.put(RestoreThread._ENTITIES_DONE, block=True)
       
  2551 
       
  2552 
       
  2553 class RestoreLoader(Loader):
       
  2554   """A Loader which imports protobuffers from a file."""
       
  2555 
       
  2556   def __init__(self, kind):
       
  2557     self.kind = kind
       
  2558 
       
  2559   def initialize(self, filename, loader_opts):
       
  2560     CheckFile(filename)
       
  2561     self.queue = Queue.Queue(1000)
       
  2562     restore_thread = RestoreThread(self.queue, filename)
       
  2563     restore_thread.start()
       
  2564 
       
  2565   def generate_records(self, filename):
       
  2566     while True:
       
  2567       record = self.queue.get(block=True)
       
  2568       if id(record) == id(RestoreThread._ENTITIES_DONE):
       
  2569         break
       
  2570       yield record
       
  2571 
       
  2572   def create_entity(self, values, key_name=None, parent=None):
       
  2573     key = StrKey(unicode(values[0], 'utf-8'))
       
  2574     entity_proto = entity_pb.EntityProto(contents=str(values[1]))
       
  2575     entity_proto.mutable_key().CopyFrom(key._Key__reference)
       
  2576     return datastore.Entity._FromPb(entity_proto)
       
  2577 
       
  2578 
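# Editor's illustrative sketch, not part of the changeset: RestoreThread and
# RestoreLoader.generate_records above hand records across a bounded queue
# and end the stream with a sentinel. Comparing id(record) to the id of the
# sentinel (identity, not equality) means a row that merely *equals* the
# sentinel string can never terminate the restore early. The same pattern in
# isolation, using `is` for the identity test:

import Queue

_DONE = object()   # unique sentinel

def _drain(queue):
  while True:
    record = queue.get(block=True)
    if record is _DONE:
      break
    yield record

_q = Queue.Queue()
_q.put(['id1', 'value1'])
_q.put(_DONE)
assert list(_drain(_q)) == [['id1', 'value1']]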
  3308 class Exporter(object):
  2579 class Exporter(object):
  3309   """A base class for serializing datastore entities.
  2580   """A base class for serializing datastore entities.
  3310 
  2581 
  3311   To add a handler for exporting an entity kind from your datastore,
  2582   To add a handler for exporting an entity kind from your datastore,
  3312   write a subclass of this class that calls Exporter.__init__ from your
  2583   write a subclass of this class that calls Exporter.__init__ from your
  3324   __properties = None
  2595   __properties = None
  3325 
  2596 
  3326   def __init__(self, kind, properties):
  2597   def __init__(self, kind, properties):
  3327     """Constructor.
  2598     """Constructor.
  3328 
  2599 
  3329     Populates this Exporter's kind and properties map. Also registers
  2600     Populates this Exporter's kind and properties map.
  3330     it so that all you need to do is instantiate your Exporter, and
       
  3331     the bulkload handler will automatically use it.
       
  3332 
  2601 
  3333     Args:
  2602     Args:
  3334       kind: a string containing the entity kind that this exporter handles
  2603       kind: a string containing the entity kind that this exporter handles
  3335 
  2604 
  3336       properties: list of (name, converter, default) tuples.
  2605       properties: list of (name, converter, default) tuples.
  3368 
  2637 
  3369     self.__properties = properties
  2638     self.__properties = properties
  3370 
  2639 
  3371   @staticmethod
  2640   @staticmethod
  3372   def RegisterExporter(exporter):
  2641   def RegisterExporter(exporter):
  3373 
  2642     """Register 'exporter' as the Exporter instance for its kind.
       
  2643 
       
  2644     Args:
       
  2645       exporter: An Exporter instance.
       
  2646     """
  3374     Exporter.__exporters[exporter.kind] = exporter
  2647     Exporter.__exporters[exporter.kind] = exporter
  3375 
  2648 
  3376   def __ExtractProperties(self, entity):
  2649   def __ExtractProperties(self, entity):
  3377     """Converts an entity into a list of string values.
  2650     """Converts an entity into a list of string values.
  3378 
  2651 
  3386       MissingPropertyError: if an expected field on the entity is missing.
  2659       MissingPropertyError: if an expected field on the entity is missing.
  3387     """
  2660     """
  3388     encoding = []
  2661     encoding = []
  3389     for name, fn, default in self.__properties:
  2662     for name, fn, default in self.__properties:
  3390       try:
  2663       try:
  3391         encoding.append(fn(getattr(entity, name)))
  2664         encoding.append(fn(entity[name]))
  3392       except AttributeError:
  2665       except (KeyError, AttributeError):
  3393         if default is None:
  2666         if default is None:
  3394           raise MissingPropertyError(name)
  2667           raise MissingPropertyError(name)
  3395         else:
  2668         else:
  3396           encoding.append(default)
  2669           encoding.append(default)
  3466   def RegisteredExporter(kind):
  2739   def RegisteredExporter(kind):
  3467     """Returns an exporter instance for the given kind if it exists."""
  2740     """Returns an exporter instance for the given kind if it exists."""
  3468     return Exporter.__exporters[kind]
  2741     return Exporter.__exporters[kind]
  3469 
  2742 
  3470 
  2743 
       
  2744 class DumpExporter(Exporter):
       
  2745   """An exporter which dumps protobuffers to a file."""
       
  2746 
       
  2747   def __init__(self, kind, result_db_filename):
       
  2748     self.kind = kind
       
  2749     self.result_db_filename = result_db_filename
       
  2750 
       
  2751   def output_entities(self, entity_generator):
       
  2752     shutil.copyfile(self.result_db_filename, self.output_filename)
       
  2753 
       
  2754 
       
  2755 class MapperRetry(Error):
       
  2756   """An exception that indicates a non-fatal error during mapping."""
       
  2757 
       
  2758 
       
  2759 class Mapper(object):
       
  2760   """A base class for mapping over datastore entities.

  2761 

  2762   To add a handler for mapping over an entity kind in your datastore,
       
  2763   write a subclass of this class that calls Mapper.__init__ from your
       
  2764   class's __init__.
       
  2765 
       
  2766   You need to implement the batch_apply or apply method on your subclass
       
  2767   for the map to do anything.
       
  2768   """
       
  2769 
       
  2770   __mappers = {}
       
  2771   kind = None
       
  2772 
       
  2773   def __init__(self, kind):
       
  2774     """Constructor.
       
  2775 
       
  2776     Populates this Mapper's kind.
       
  2777 
       
  2778     Args:
       
  2779       kind: a string containing the entity kind that this mapper handles
       
  2780     """
       
  2781     Validate(kind, basestring)
       
  2782     self.kind = kind
       
  2783 
       
  2784     GetImplementationClass(kind)
       
  2785 
       
  2786   @staticmethod
       
  2787   def RegisterMapper(mapper):
       
  2788     """Register 'mapper' as the Mapper instance for its kind.
       
  2789 
       
  2790     Args:
       
  2791       mapper: A Mapper instance.
       
  2792     """
       
  2793     Mapper.__mappers[mapper.kind] = mapper
       
  2794 
       
  2795   def initialize(self, mapper_opts):
       
  2796     """Performs initialization.
       
  2797 
       
  2798     Args:
       
  2799       mapper_opts: The string given as the --mapper_opts flag argument.
       
  2800     """
       
  2801     pass
       
  2802 
       
  2803   def finalize(self):
       
  2804     """Performs finalization actions after the download completes."""
       
  2805     pass
       
  2806 
       
  2807   def apply(self, entity):
       
  2808     print 'Default map function doing nothing to %s' % entity
       
  2809 
       
  2810   def batch_apply(self, entities):
       
  2811     for entity in entities:
       
  2812       self.apply(entity)
       
  2813 
       
  2814   @staticmethod
       
  2815   def RegisteredMappers():
       
  2816     """Returns a dictionary of the mapper instances that have been created."""
       
  2817     return dict(Mapper.__mappers)
       
  2818 
       
  2819   @staticmethod
       
  2820   def RegisteredMapper(kind):
       
  2821     """Returns a mapper instance for the given kind if it exists."""
       
  2822     return Mapper.__mappers[kind]
       
  2823 
       
  2824 
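# Editor's illustrative example, not part of the changeset: a minimal Mapper
# subclass. Only apply() is overridden; the inherited batch_apply() then
# feeds it one entity at a time. 'Greeting' is a hypothetical kind, and
# Mapper.__init__ requires that an implementation class for the kind is
# resolvable (see the GetImplementationClass call above):
#
#   class GreetingMapper(Mapper):
#     def __init__(self):
#       Mapper.__init__(self, 'Greeting')
#
#     def apply(self, entity):
#       print 'mapping over %s' % entity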
  3471 class QueueJoinThread(threading.Thread):
  2825 class QueueJoinThread(threading.Thread):
  3472   """A thread that joins a queue and exits.
  2826   """A thread that joins a queue and exits.
  3473 
  2827 
  3474   Queue joins do not have a timeout.  To simulate a queue join with
  2828   Queue joins do not have a timeout.  To simulate a queue join with
  3475   timeout, run this thread and join it with a timeout.
  2829   timeout, run this thread and join it with a timeout.
  3490     self.queue.join()
  2844     self.queue.join()
  3491 
  2845 
  3492 
  2846 
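# Editor's sketch, not part of the SDK source: the timed-join pattern the
# QueueJoinThread docstring describes. Assumes the class's elided constructor
# takes the queue to join, as the visible self.queue.join() suggests.
def _ExampleTimedQueueJoin(queue, timeout=1.0):
  """Returns True if queue.join() completes within timeout seconds."""
  joiner = QueueJoinThread(queue)
  joiner.start()
  joiner.join(timeout=timeout)
  return not joiner.isAlive()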
  3493 def InterruptibleQueueJoin(queue,
  2847 def InterruptibleQueueJoin(queue,
  3494                            thread_local,
  2848                            thread_local,
  3495                            thread_gate,
  2849                            thread_pool,
  3496                            queue_join_thread_factory=QueueJoinThread,
  2850                            queue_join_thread_factory=QueueJoinThread,
  3497                            check_workers=True):
  2851                            check_workers=True):
  3498   """Repeatedly joins the given ReQueue or Queue.Queue with short timeout.
  2852   """Repeatedly joins the given ReQueue or Queue.Queue with short timeout.
  3499 
  2853 
  3500   Between each timeout on the join, worker threads are checked.
  2854   Between each timeout on the join, worker threads are checked.
  3501 
  2855 
  3502   Args:
  2856   Args:
  3503     queue: A Queue.Queue or ReQueue instance.
  2857     queue: A Queue.Queue or ReQueue instance.
  3504     thread_local: A threading.local instance which indicates interrupts.
  2858     thread_local: A threading.local instance which indicates interrupts.
  3505     thread_gate: A ThreadGate instance.
  2859     thread_pool: An AdaptiveThreadPool instance.
  3506     queue_join_thread_factory: Used for dependency injection.
  2860     queue_join_thread_factory: Used for dependency injection.
  3507     check_workers: Whether to interrupt the join on worker death.
  2861     check_workers: Whether to interrupt the join on worker death.
  3508 
  2862 
  3509   Returns:
  2863   Returns:
  3510     True unless the queue join is interrupted by SIGINT or worker death.
  2864     True unless the queue join is interrupted by SIGINT or worker death.
  3517       return True
  2871       return True
  3518     if thread_local.shut_down:
  2872     if thread_local.shut_down:
  3519       logger.debug('Queue join interrupted')
  2873       logger.debug('Queue join interrupted')
  3520       return False
  2874       return False
  3521     if check_workers:
  2875     if check_workers:
  3522       for worker_thread in thread_gate.Threads():
  2876       for worker_thread in thread_pool.Threads():
  3523         if not worker_thread.isAlive():
  2877         if not worker_thread.isAlive():
  3524           return False
  2878           return False
  3525 
  2879 
  3526 
  2880 
  3527 def ShutdownThreads(data_source_thread, work_queue, thread_gate):
  2881 def ShutdownThreads(data_source_thread, thread_pool):
  3528   """Shuts down the worker and data source threads.
  2882   """Shuts down the worker and data source threads.
  3529 
  2883 
  3530   Args:
  2884   Args:
  3531     data_source_thread: A running DataSourceThread instance.
  2885     data_source_thread: A running DataSourceThread instance.
  3532     work_queue: The work queue.
  2886     thread_pool: An AdaptiveThreadPool instance with workers registered.
  3533     thread_gate: A ThreadGate instance with workers registered.
       
  3534   """
  2887   """
  3535   logger.info('An error occurred. Shutting down...')
  2888   logger.info('An error occurred. Shutting down...')
  3536 
  2889 
  3537   data_source_thread.exit_flag = True
  2890   data_source_thread.exit_flag = True
  3538 
  2891 
  3539   for thread in thread_gate.Threads():
  2892   thread_pool.Shutdown()
  3540     thread.exit_flag = True
       
  3541 
       
  3542   for unused_thread in thread_gate.Threads():
       
  3543     thread_gate.EnableThread()
       
  3544 
  2893 
  3545   data_source_thread.join(timeout=3.0)
  2894   data_source_thread.join(timeout=3.0)
  3546   if data_source_thread.isAlive():
  2895   if data_source_thread.isAlive():
  3547     logger.warn('%s hung while trying to exit',
  2896     logger.warn('%s hung while trying to exit',
  3548                 data_source_thread.GetFriendlyName())
  2897                 data_source_thread.GetFriendlyName())
  3549 
       
  3550   while not work_queue.empty():
       
  3551     try:
       
  3552       unused_item = work_queue.get_nowait()
       
  3553       work_queue.task_done()
       
  3554     except Queue.Empty:
       
  3555       pass
       
  3556 
  2898 
  3557 
  2899 
  3558 class BulkTransporterApp(object):
  2900 class BulkTransporterApp(object):
  3559   """Class to wrap bulk transport application functionality."""
  2901   """Class to wrap bulk transport application functionality."""
  3560 
  2902 
  3561   def __init__(self,
  2903   def __init__(self,
  3562                arg_dict,
  2904                arg_dict,
  3563                input_generator_factory,
  2905                input_generator_factory,
  3564                throttle,
  2906                throttle,
  3565                progress_db,
  2907                progress_db,
  3566                workerthread_factory,
       
  3567                progresstrackerthread_factory,
  2908                progresstrackerthread_factory,
  3568                max_queue_size=DEFAULT_QUEUE_SIZE,
  2909                max_queue_size=DEFAULT_QUEUE_SIZE,
  3569                request_manager_factory=RequestManager,
  2910                request_manager_factory=RequestManager,
  3570                datasourcethread_factory=DataSourceThread,
  2911                datasourcethread_factory=DataSourceThread,
  3571                work_queue_factory=ReQueue,
  2912                progress_queue_factory=Queue.Queue,
  3572                progress_queue_factory=Queue.Queue):
  2913                thread_pool_factory=adaptive_thread_pool.AdaptiveThreadPool):
  3573     """Instantiate a BulkTransporterApp.
  2914     """Instantiate a BulkTransporterApp.
  3574 
  2915 
  3575     Uploads or downloads data to or from application using HTTP requests.
  2916     Uploads or downloads data to or from application using HTTP requests.
  3576     When run, the class will spin up a number of threads to read entities
  2917     When run, the class will spin up a number of threads to read entities
  3577     from the data source, pass those to a number of worker threads
  2918     from the data source, pass those to a number of worker threads
  3582     Args:
  2923     Args:
  3583       arg_dict: Dictionary of command line options.
  2924       arg_dict: Dictionary of command line options.
  3584       input_generator_factory: A factory that creates a WorkItem generator.
  2925       input_generator_factory: A factory that creates a WorkItem generator.
  3585       throttle: A Throttle instance.
  2926       throttle: A Throttle instance.
  3586       progress_db: The database to use for replaying/recording progress.
  2927       progress_db: The database to use for replaying/recording progress.
  3587       workerthread_factory: A factory for worker threads.
       
  3588       progresstrackerthread_factory: Used for dependency injection.
  2928       progresstrackerthread_factory: Used for dependency injection.
  3589       max_queue_size: Maximum size of the queues before they should block.
  2929       max_queue_size: Maximum size of the queues before they should block.
  3590       request_manager_factory: Used for dependency injection.
  2930       request_manager_factory: Used for dependency injection.
  3591       datasourcethread_factory: Used for dependency injection.
  2931       datasourcethread_factory: Used for dependency injection.
  3592       work_queue_factory: Used for dependency injection.
       
  3593       progress_queue_factory: Used for dependency injection.
  2932       progress_queue_factory: Used for dependency injection.
       
  2933       thread_pool_factory: Used for dependency injection.
  3594     """
  2934     """
  3595     self.app_id = arg_dict['app_id']
  2935     self.app_id = arg_dict['app_id']
  3596     self.post_url = arg_dict['url']
  2936     self.post_url = arg_dict['url']
  3597     self.kind = arg_dict['kind']
  2937     self.kind = arg_dict['kind']
  3598     self.batch_size = arg_dict['batch_size']
  2938     self.batch_size = arg_dict['batch_size']
  3599     self.input_generator_factory = input_generator_factory
  2939     self.input_generator_factory = input_generator_factory
  3600     self.num_threads = arg_dict['num_threads']
  2940     self.num_threads = arg_dict['num_threads']
  3601     self.email = arg_dict['email']
  2941     self.email = arg_dict['email']
  3602     self.passin = arg_dict['passin']
  2942     self.passin = arg_dict['passin']
       
  2943     self.dry_run = arg_dict['dry_run']
  3603     self.throttle = throttle
  2944     self.throttle = throttle
  3604     self.progress_db = progress_db
  2945     self.progress_db = progress_db
  3605     self.workerthread_factory = workerthread_factory
       
  3606     self.progresstrackerthread_factory = progresstrackerthread_factory
  2946     self.progresstrackerthread_factory = progresstrackerthread_factory
  3607     self.max_queue_size = max_queue_size
  2947     self.max_queue_size = max_queue_size
  3608     self.request_manager_factory = request_manager_factory
  2948     self.request_manager_factory = request_manager_factory
  3609     self.datasourcethread_factory = datasourcethread_factory
  2949     self.datasourcethread_factory = datasourcethread_factory
  3610     self.work_queue_factory = work_queue_factory
       
  3611     self.progress_queue_factory = progress_queue_factory
  2950     self.progress_queue_factory = progress_queue_factory
       
  2951     self.thread_pool_factory = thread_pool_factory
  3612     (scheme,
  2952     (scheme,
  3613      self.host_port, self.url_path,
  2953      self.host_port, self.url_path,
  3614      unused_query, unused_fragment) = urlparse.urlsplit(self.post_url)
  2954      unused_query, unused_fragment) = urlparse.urlsplit(self.post_url)
  3615     self.secure = (scheme == 'https')
  2955     self.secure = (scheme == 'https')
  3616 
  2956 
  3621       AuthenticationError: If authentication is required and fails.
  2961       AuthenticationError: If authentication is required and fails.
  3622 
  2962 
  3623     Returns:
  2963     Returns:
  3624       Error code suitable for sys.exit, e.g. 0 on success, 1 on failure.
  2964       Error code suitable for sys.exit, e.g. 0 on success, 1 on failure.
  3625     """
  2965     """
  3626     thread_gate = ThreadGate(True)
  2966     self.error = False
       
  2967     thread_pool = self.thread_pool_factory(
       
  2968         self.num_threads, queue_size=self.max_queue_size)
  3627 
  2969 
  3628     self.throttle.Register(threading.currentThread())
  2970     self.throttle.Register(threading.currentThread())
  3629     threading.currentThread().exit_flag = False
  2971     threading.currentThread().exit_flag = False
  3630 
       
  3631     work_queue = self.work_queue_factory(self.max_queue_size)
       
  3632 
  2972 
  3633     progress_queue = self.progress_queue_factory(self.max_queue_size)
  2973     progress_queue = self.progress_queue_factory(self.max_queue_size)
  3634     request_manager = self.request_manager_factory(self.app_id,
  2974     request_manager = self.request_manager_factory(self.app_id,
  3635                                                    self.host_port,
  2975                                                    self.host_port,
  3636                                                    self.url_path,
  2976                                                    self.url_path,
  3637                                                    self.kind,
  2977                                                    self.kind,
  3638                                                    self.throttle,
  2978                                                    self.throttle,
  3639                                                    self.batch_size,
  2979                                                    self.batch_size,
  3640                                                    self.secure,
  2980                                                    self.secure,
  3641                                                    self.email,
  2981                                                    self.email,
  3642                                                    self.passin)
  2982                                                    self.passin,
       
  2983                                                    self.dry_run)
  3643     try:
  2984     try:
  3644       request_manager.Authenticate()
  2985       request_manager.Authenticate()
  3645     except Exception, e:
  2986     except Exception, e:
       
  2987       self.error = True
  3646       if not isinstance(e, urllib2.HTTPError) or (
  2988       if not isinstance(e, urllib2.HTTPError) or (
  3647           e.code != 302 and e.code != 401):
  2989           e.code != 302 and e.code != 401):
  3648         logger.exception('Exception during authentication')
  2990         logger.exception('Exception during authentication')
  3649       raise AuthenticationError()
  2991       raise AuthenticationError()
  3650     if (request_manager.auth_called and
  2992     if (request_manager.auth_called and
  3651         not request_manager.authenticated):
  2993         not request_manager.authenticated):
       
  2994       self.error = True
  3652       raise AuthenticationError('Authentication failed')
  2995       raise AuthenticationError('Authentication failed')
  3653 
  2996 
  3654     for unused_idx in xrange(self.num_threads):
  2997     for thread in thread_pool.Threads():
  3655       thread = self.workerthread_factory(work_queue,
       
  3656                                          self.throttle,
       
  3657                                          thread_gate,
       
  3658                                          request_manager,
       
  3659                                          self.num_threads,
       
  3660                                          self.batch_size)
       
  3661       self.throttle.Register(thread)
  2998       self.throttle.Register(thread)
  3662       thread_gate.Register(thread)
       
  3663 
  2999 
  3664     self.progress_thread = self.progresstrackerthread_factory(
  3000     self.progress_thread = self.progresstrackerthread_factory(
  3665         progress_queue, self.progress_db)
  3001         progress_queue, self.progress_db)
  3666 
  3002 
  3667     if self.progress_db.UseProgressData():
  3003     if self.progress_db.UseProgressData():
  3669       progress_generator_factory = self.progress_db.GetProgressStatusGenerator
  3005       progress_generator_factory = self.progress_db.GetProgressStatusGenerator
  3670     else:
  3006     else:
  3671       progress_generator_factory = None
  3007       progress_generator_factory = None
  3672 
  3008 
  3673     self.data_source_thread = (
  3009     self.data_source_thread = (
  3674         self.datasourcethread_factory(work_queue,
  3010         self.datasourcethread_factory(request_manager,
       
  3011                                       thread_pool,
  3675                                       progress_queue,
  3012                                       progress_queue,
  3676                                       self.input_generator_factory,
  3013                                       self.input_generator_factory,
  3677                                       progress_generator_factory))
  3014                                       progress_generator_factory))
  3678 
  3015 
  3679     thread_local = threading.local()
  3016     thread_local = threading.local()
  3680     thread_local.shut_down = False
  3017     thread_local.shut_down = False
  3681 
  3018 
  3682     def Interrupt(unused_signum, unused_frame):
  3019     def Interrupt(unused_signum, unused_frame):
  3683       """Shutdown gracefully in response to a signal."""
  3020       """Shutdown gracefully in response to a signal."""
  3684       thread_local.shut_down = True
  3021       thread_local.shut_down = True
       
  3022       self.error = True
  3685 
  3023 
  3686     signal.signal(signal.SIGINT, Interrupt)
  3024     signal.signal(signal.SIGINT, Interrupt)
  3687 
  3025 
  3688     self.progress_thread.start()
  3026     self.progress_thread.start()
  3689     self.data_source_thread.start()
  3027     self.data_source_thread.start()
  3690     for thread in thread_gate.Threads():
       
  3691       thread.start()
       
  3692 
  3028 
  3693 
  3029 
  3694     while not thread_local.shut_down:
  3030     while not thread_local.shut_down:
  3695       self.data_source_thread.join(timeout=0.25)
  3031       self.data_source_thread.join(timeout=0.25)
  3696 
  3032 
  3697       if self.data_source_thread.isAlive():
  3033       if self.data_source_thread.isAlive():
  3698         for thread in list(thread_gate.Threads()) + [self.progress_thread]:
  3034         for thread in list(thread_pool.Threads()) + [self.progress_thread]:
  3699           if not thread.isAlive():
  3035           if not thread.isAlive():
  3700             logger.info('Unexpected thread death: %s', thread.getName())
  3036             logger.info('Unexpected thread death: %s', thread.getName())
  3701             thread_local.shut_down = True
  3037             thread_local.shut_down = True
       
  3038             self.error = True
  3702             break
  3039             break
  3703       else:
  3040       else:
  3704         break
  3041         break
  3705 
       
  3706     if thread_local.shut_down:
       
  3707       ShutdownThreads(self.data_source_thread, work_queue, thread_gate)
       
  3708 
  3042 
  3709     def _Join(ob, msg):
  3043     def _Join(ob, msg):
  3710       logger.debug('Waiting for %s...', msg)
  3044       logger.debug('Waiting for %s...', msg)
  3711       if isinstance(ob, threading.Thread):
  3045       if isinstance(ob, threading.Thread):
  3712         ob.join(timeout=3.0)
  3046         ob.join(timeout=3.0)
  3713         if ob.isAlive():
  3047         if ob.isAlive():
  3714           logger.debug('Joining %s failed', ob.GetFriendlyName())
  3048           logger.debug('Joining %s failed', ob)
  3715         else:
  3049         else:
  3716           logger.debug('... done.')
  3050           logger.debug('... done.')
  3717       elif isinstance(ob, (Queue.Queue, ReQueue)):
  3051       elif isinstance(ob, (Queue.Queue, ReQueue)):
  3718         if not InterruptibleQueueJoin(ob, thread_local, thread_gate):
  3052         if not InterruptibleQueueJoin(ob, thread_local, thread_pool):
  3719           ShutdownThreads(self.data_source_thread, work_queue, thread_gate)
  3053           ShutdownThreads(self.data_source_thread, thread_pool)
  3720       else:
  3054       else:
  3721         ob.join()
  3055         ob.join()
  3722         logger.debug('... done.')
  3056         logger.debug('... done.')
  3723 
  3057 
  3724     _Join(work_queue, 'work_queue to flush')
  3058     if self.data_source_thread.error or thread_local.shut_down:
  3725 
  3059       ShutdownThreads(self.data_source_thread, thread_pool)
  3726     for unused_thread in thread_gate.Threads():
  3060     else:
  3727       work_queue.put(_THREAD_SHOULD_EXIT)
  3061       _Join(thread_pool.requeue, 'worker threads to finish')
  3728 
  3062 
  3729     for unused_thread in thread_gate.Threads():
  3063     thread_pool.Shutdown()
  3730       thread_gate.EnableThread()
  3064     thread_pool.JoinThreads()
  3731 
  3065     thread_pool.CheckErrors()
  3732     for thread in thread_gate.Threads():
  3066     print ''
  3733       _Join(thread, 'thread [%s] to terminate' % thread.getName())
       
  3734 
       
  3735       thread.CheckError()
       
  3736 
  3067 
  3737     if self.progress_thread.isAlive():
  3068     if self.progress_thread.isAlive():
  3738       InterruptibleQueueJoin(progress_queue, thread_local, thread_gate,
  3069       InterruptibleQueueJoin(progress_queue, thread_local, thread_pool,
  3739                              check_workers=False)
  3070                              check_workers=False)
  3740     else:
  3071     else:
  3741       logger.warn('Progress thread exited prematurely')
  3072       logger.warn('Progress thread exited prematurely')
  3742 
  3073 
  3743     progress_queue.put(_THREAD_SHOULD_EXIT)
  3074     progress_queue.put(_THREAD_SHOULD_EXIT)
  3761   def __init__(self, *args, **kwargs):
  3092   def __init__(self, *args, **kwargs):
  3762     BulkTransporterApp.__init__(self, *args, **kwargs)
  3093     BulkTransporterApp.__init__(self, *args, **kwargs)
  3763 
  3094 
  3764   def ReportStatus(self):
  3095   def ReportStatus(self):
  3765     """Display a message reporting the final status of the transfer."""
  3096     """Display a message reporting the final status of the transfer."""
  3766     total_up, duration = self.throttle.TotalTransferred(BANDWIDTH_UP)
  3097     total_up, duration = self.throttle.TotalTransferred(
       
  3098         remote_api_throttle.BANDWIDTH_UP)
  3767     s_total_up, unused_duration = self.throttle.TotalTransferred(
  3099     s_total_up, unused_duration = self.throttle.TotalTransferred(
  3768         HTTPS_BANDWIDTH_UP)
  3100         remote_api_throttle.HTTPS_BANDWIDTH_UP)
  3769     total_up += s_total_up
  3101     total_up += s_total_up
  3770     total = total_up
  3102     total = total_up
  3771     logger.info('%d entities total, %d previously transferred',
  3103     logger.info('%d entities total, %d previously transferred',
  3772                 self.data_source_thread.read_count,
  3104                 self.data_source_thread.read_count,
  3773                 self.data_source_thread.xfer_count)
  3105                 self.data_source_thread.xfer_count)
  3791   def __init__(self, *args, **kwargs):
  3123   def __init__(self, *args, **kwargs):
  3792     BulkTransporterApp.__init__(self, *args, **kwargs)
  3124     BulkTransporterApp.__init__(self, *args, **kwargs)
  3793 
  3125 
  3794   def ReportStatus(self):
  3126   def ReportStatus(self):
  3795     """Display a message reporting the final status of the transfer."""
  3127     """Display a message reporting the final status of the transfer."""
  3796     total_down, duration = self.throttle.TotalTransferred(BANDWIDTH_DOWN)
  3128     total_down, duration = self.throttle.TotalTransferred(
       
  3129         remote_api_throttle.BANDWIDTH_DOWN)
  3797     s_total_down, unused_duration = self.throttle.TotalTransferred(
  3130     s_total_down, unused_duration = self.throttle.TotalTransferred(
  3798         HTTPS_BANDWIDTH_DOWN)
  3131         remote_api_throttle.HTTPS_BANDWIDTH_DOWN)
  3799     total_down += s_total_down
  3132     total_down += s_total_down
  3800     total = total_down
  3133     total = total_down
  3801     existing_count = self.progress_thread.existing_count
  3134     existing_count = self.progress_thread.existing_count
  3802     xfer_count = self.progress_thread.EntitiesTransferred()
  3135     xfer_count = self.progress_thread.EntitiesTransferred()
  3803     logger.info('Have %d entities, %d previously transferred',
  3136     logger.info('Have %d entities, %d previously transferred',
  3804                 xfer_count + existing_count, existing_count)
  3137                 xfer_count, existing_count)
  3805     logger.info('%d entities (%d bytes) transferred in %.1f seconds',
  3138     logger.info('%d entities (%d bytes) transferred in %.1f seconds',
  3806                 xfer_count, total, duration)
  3139                 xfer_count, total, duration)
  3807     return 0
  3140     if self.error:
       
  3141       return 1
       
  3142     else:
       
  3143       return 0
       
  3144 
       
  3145 
       
  3146 class BulkMapperApp(BulkTransporterApp):
       
  3147   """Class to encapsulate bulk map functionality."""
       
  3148 
       
  3149   def __init__(self, *args, **kwargs):
       
  3150     BulkTransporterApp.__init__(self, *args, **kwargs)
       
  3151 
       
  3152   def ReportStatus(self):
       
  3153     """Display a message reporting the final status of the transfer."""
       
  3154     total_down, duration = self.throttle.TotalTransferred(
       
  3155         remote_api_throttle.BANDWIDTH_DOWN)
       
  3156     s_total_down, unused_duration = self.throttle.TotalTransferred(
       
  3157         remote_api_throttle.HTTPS_BANDWIDTH_DOWN)
       
  3158     total_down += s_total_down
       
  3159     total = total_down
       
  3160     xfer_count = self.progress_thread.EntitiesTransferred()
       
  3161     logger.info('The following may be inaccurate if any mapper tasks '
       
  3162                 'encountered errors and had to be retried.')
       
  3163     logger.info('Applied mapper to %s entities.',
       
  3164                 xfer_count)
       
  3165     logger.info('%s entities (%s bytes) transferred in %.1f seconds',
       
  3166                 xfer_count, total, duration)
       
  3167     if self.error:
       
  3168       return 1
       
  3169     else:
       
  3170       return 0
  3808 
  3171 
  3809 
  3172 
  3810 def PrintUsageExit(code):
  3173 def PrintUsageExit(code):
  3811   """Prints usage information and exits with a status code.
  3174   """Prints usage information and exits with a status code.
  3812 
  3175 
  3841              'result_db_filename=',
  3204              'result_db_filename=',
  3842              'download',
  3205              'download',
  3843              'loader_opts=',
  3206              'loader_opts=',
  3844              'exporter_opts=',
  3207              'exporter_opts=',
  3845              'log_file=',
  3208              'log_file=',
       
  3209              'mapper_opts=',
  3846              'email=',
  3210              'email=',
  3847              'passin',
  3211              'passin',
       
  3212              'map',
       
  3213              'dry_run',
       
  3214              'dump',
       
  3215              'restore',
  3848              ]
  3216              ]
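# Editor's note: hedged example invocations covering the modes wired up by
# the flag list above (application name, kind, and filenames hypothetical):
#
#   bulkloader.py --dump --url=http://myapp.appspot.com/remote_api \
#       --filename=dump.sql3 --kind=Greeting
#   bulkloader.py --restore --url=http://myapp.appspot.com/remote_api \
#       --filename=dump.sql3 --kind=Greeting
#   bulkloader.py --map --url=http://myapp.appspot.com/remote_api \
#       --kind=Greeting --config_file=config.py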
  3849 
  3217 
  3850 
  3218 
  3851 def ParseArguments(argv):
  3219 def ParseArguments(argv, die_fn=lambda: PrintUsageExit(1)):
  3852   """Parses command-line arguments.
  3220   """Parses command-line arguments.
  3853 
  3221 
  3854   Prints out a help message if -h or --help is supplied.
  3222   Prints out a help message if -h or --help is supplied.
  3855 
  3223 
  3856   Args:
  3224   Args:
  3857     argv: List of command-line arguments.
  3225     argv: List of command-line arguments.
       
  3226     die_fn: Function to invoke to end the program.
  3858 
  3227 
  3859   Returns:
  3228   Returns:
  3860     A dictionary containing the value of command-line options.
  3229     A dictionary containing the value of command-line options.
  3861   """
  3230   """
  3862   opts, unused_args = getopt.getopt(
  3231   opts, unused_args = getopt.getopt(
  3865       FLAG_SPEC)
  3234       FLAG_SPEC)
  3866 
  3235 
  3867   arg_dict = {}
  3236   arg_dict = {}
  3868 
  3237 
  3869   arg_dict['url'] = REQUIRED_OPTION
  3238   arg_dict['url'] = REQUIRED_OPTION
  3870   arg_dict['filename'] = REQUIRED_OPTION
  3239   arg_dict['filename'] = None
  3871   arg_dict['config_file'] = REQUIRED_OPTION
  3240   arg_dict['config_file'] = None
  3872   arg_dict['kind'] = REQUIRED_OPTION
  3241   arg_dict['kind'] = None
  3873 
  3242 
  3874   arg_dict['batch_size'] = DEFAULT_BATCH_SIZE
  3243   arg_dict['batch_size'] = None
  3875   arg_dict['num_threads'] = DEFAULT_THREAD_COUNT
  3244   arg_dict['num_threads'] = DEFAULT_THREAD_COUNT
  3876   arg_dict['bandwidth_limit'] = DEFAULT_BANDWIDTH_LIMIT
  3245   arg_dict['bandwidth_limit'] = DEFAULT_BANDWIDTH_LIMIT
  3877   arg_dict['rps_limit'] = DEFAULT_RPS_LIMIT
  3246   arg_dict['rps_limit'] = DEFAULT_RPS_LIMIT
  3878   arg_dict['http_limit'] = DEFAULT_REQUEST_LIMIT
  3247   arg_dict['http_limit'] = DEFAULT_REQUEST_LIMIT
  3879 
  3248 
  3887   arg_dict['exporter_opts'] = None
  3256   arg_dict['exporter_opts'] = None
  3888   arg_dict['debug'] = False
  3257   arg_dict['debug'] = False
  3889   arg_dict['log_file'] = None
  3258   arg_dict['log_file'] = None
  3890   arg_dict['email'] = None
  3259   arg_dict['email'] = None
  3891   arg_dict['passin'] = False
  3260   arg_dict['passin'] = False
       
  3261   arg_dict['mapper_opts'] = None
       
  3262   arg_dict['map'] = False
       
  3263   arg_dict['dry_run'] = False
       
  3264   arg_dict['dump'] = False
       
  3265   arg_dict['restore'] = False
  3892 
  3266 
  3893   def ExpandFilename(filename):
  3267   def ExpandFilename(filename):
  3894     """Expand shell variables and ~usernames in filename."""
  3268     """Expand shell variables and ~usernames in filename."""
  3895     return os.path.expandvars(os.path.expanduser(filename))
  3269     return os.path.expandvars(os.path.expanduser(filename))
  3896 
  3270 
  3936     elif option == '--loader_opts':
  3310     elif option == '--loader_opts':
  3937       arg_dict['loader_opts'] = value
  3311       arg_dict['loader_opts'] = value
  3938     elif option == '--exporter_opts':
  3312     elif option == '--exporter_opts':
  3939       arg_dict['exporter_opts'] = value
  3313       arg_dict['exporter_opts'] = value
  3940     elif option == '--log_file':
  3314     elif option == '--log_file':
  3941       arg_dict['log_file'] = value
  3315       arg_dict['log_file'] = ExpandFilename(value)
  3942     elif option == '--email':
  3316     elif option == '--email':
  3943       arg_dict['email'] = value
  3317       arg_dict['email'] = value
  3944     elif option == '--passin':
  3318     elif option == '--passin':
  3945       arg_dict['passin'] = True
  3319       arg_dict['passin'] = True
  3946 
  3320     elif option == '--map':
  3947   return ProcessArguments(arg_dict, die_fn=lambda: PrintUsageExit(1))
  3321       arg_dict['map'] = True
       
  3322     elif option == '--mapper_opts':
       
  3323       arg_dict['mapper_opts'] = value
       
  3324     elif option == '--dry_run':
       
  3325       arg_dict['dry_run'] = True
       
  3326     elif option == '--dump':
       
  3327       arg_dict['dump'] = True
       
  3328     elif option == '--restore':
       
  3329       arg_dict['restore'] = True
       
  3330 
       
  3331   return ProcessArguments(arg_dict, die_fn=die_fn)
  3948 
  3332 
  3949 
  3333 
  3950 def ThrottleLayout(bandwidth_limit, http_limit, rps_limit):
  3334 def ThrottleLayout(bandwidth_limit, http_limit, rps_limit):
  3951   """Return a dictionary indicating the throttle options."""
  3335   """Return a dictionary indicating the throttle options."""
  3952   return {
  3336   bulkloader_limits = dict(remote_api_throttle.NO_LIMITS)
  3953       BANDWIDTH_UP: bandwidth_limit,
  3337   bulkloader_limits.update({
  3954       BANDWIDTH_DOWN: bandwidth_limit,
  3338       remote_api_throttle.BANDWIDTH_UP: bandwidth_limit,
  3955       REQUESTS: http_limit,
  3339       remote_api_throttle.BANDWIDTH_DOWN: bandwidth_limit,
  3956       HTTPS_BANDWIDTH_UP: bandwidth_limit / 5,
  3340       remote_api_throttle.REQUESTS: http_limit,
  3957       HTTPS_BANDWIDTH_DOWN: bandwidth_limit / 5,
  3341       remote_api_throttle.HTTPS_BANDWIDTH_UP: bandwidth_limit,
  3958       HTTPS_REQUESTS: http_limit / 5,
  3342       remote_api_throttle.HTTPS_BANDWIDTH_DOWN: bandwidth_limit,
  3959       RECORDS: rps_limit,
  3343       remote_api_throttle.HTTPS_REQUESTS: http_limit,
  3960   }
  3344       remote_api_throttle.ENTITIES_FETCHED: rps_limit,
       
  3345       remote_api_throttle.ENTITIES_MODIFIED: rps_limit,
       
  3346   })
       
  3347   return bulkloader_limits
  3961 
  3348 
  3962 
  3349 
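# Editor's note: a sketch of how the layout above is consumed; this mirrors
# the real call further below in this file (the limits are hypothetical):
#
#   throttle = remote_api_throttle.Throttle(
#       layout=ThrottleLayout(250000, 8, 20))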
  3963 def CheckOutputFile(filename):
  3350 def CheckOutputFile(filename):
  3964   """Check that the given file does not exist and can be opened for writing.
  3351   """Check that the given file does not exist and can be opened for writing.
  3965 
  3352 
  3967     filename: The name of the file.
  3354     filename: The name of the file.
  3968 
  3355 
  3969   Raises:
  3356   Raises:
  3970     FileExistsError: if the given filename already exists.
  3357     FileExistsError: if the given filename already exists.
  3971     FileNotWritableError: if the given filename's directory is not writable.
  3358     FileNotWritableError: if the given filename's directory is not writable.
  3972   """
  3359   """
  3973   if os.path.exists(filename):
  3360   full_path = os.path.abspath(filename)
       
  3361   if os.path.exists(full_path):
  3974     raise FileExistsError('%s: output file exists' % filename)
  3362     raise FileExistsError('%s: output file exists' % filename)
  3975   elif not os.access(os.path.dirname(filename), os.W_OK):
  3363   elif not os.access(os.path.dirname(full_path), os.W_OK):
  3976     raise FileNotWritableError(
  3364     raise FileNotWritableError(
  3977         '%s: not writable' % os.path.dirname(filename))
  3365         '%s: not writable' % os.path.dirname(full_path))
  3978 
  3366 
  3979 
  3367 
  3980 def LoadConfig(config_file_name, exit_fn=sys.exit):
  3368 def LoadConfig(config_file_name, exit_fn=sys.exit):
  3981   """Loads a config file and registers any Loader classes present.
  3369   """Loads a config file and registers any Loader classes present.
  3982 
  3370 
  3997           Loader.RegisterLoader(cls())
  3385           Loader.RegisterLoader(cls())
  3998 
  3386 
  3999       if hasattr(bulkloader_config, 'exporters'):
  3387       if hasattr(bulkloader_config, 'exporters'):
  4000         for cls in bulkloader_config.exporters:
  3388         for cls in bulkloader_config.exporters:
  4001           Exporter.RegisterExporter(cls())
  3389           Exporter.RegisterExporter(cls())
       
  3390 
       
  3391       if hasattr(bulkloader_config, 'mappers'):
       
  3392         for cls in bulkloader_config.mappers:
       
  3393           Mapper.RegisterMapper(cls())
       
  3394 
  4002     except NameError, e:
  3395     except NameError, e:
  4003       m = re.search(r"[^']*'([^']*)'.*", str(e))
  3396       m = re.search(r"[^']*'([^']*)'.*", str(e))
  4004       if m.groups() and m.group(1) == 'Loader':
  3397       if m.groups() and m.group(1) == 'Loader':
  4005         print >>sys.stderr, """
  3398         print >>sys.stderr, """
  4006 The config file format has changed and you appear to be using an old-style
  3399 The config file format has changed and you appear to be using an old-style
  4056 
  3449 
  4057 def _MakeSignature(app_id=None,
  3450 def _MakeSignature(app_id=None,
  4058                    url=None,
  3451                    url=None,
  4059                    kind=None,
  3452                    kind=None,
  4060                    db_filename=None,
  3453                    db_filename=None,
       
  3454                    perform_map=None,
  4061                    download=None,
  3455                    download=None,
  4062                    has_header=None,
  3456                    has_header=None,
  4063                    result_db_filename=None):
  3457                    result_db_filename=None,
       
  3458                    dump=None,
       
  3459                    restore=None):
  4064   """Returns a string that identifies the important options for the database."""
  3460   """Returns a string that identifies the important options for the database."""
  4065   if download:
  3461   if download:
  4066     result_db_line = 'result_db: %s' % result_db_filename
  3462     result_db_line = 'result_db: %s' % result_db_filename
  4067   else:
  3463   else:
  4068     result_db_line = ''
  3464     result_db_line = ''
  4069   return u"""
  3465   return u"""
  4070   app_id: %s
  3466   app_id: %s
  4071   url: %s
  3467   url: %s
  4072   kind: %s
  3468   kind: %s
  4073   download: %s
  3469   download: %s
       
  3470   map: %s
       
  3471   dump: %s
       
  3472   restore: %s
  4074   progress_db: %s
  3473   progress_db: %s
  4075   has_header: %s
  3474   has_header: %s
  4076   %s
  3475   %s
  4077   """ % (app_id, url, kind, download, db_filename, has_header, result_db_line)
  3476   """ % (app_id, url, kind, download, perform_map, dump, restore, db_filename,
       
  3477          has_header, result_db_line)
  4078 
  3478 
  4079 
  3479 
  4080 def ProcessArguments(arg_dict,
  3480 def ProcessArguments(arg_dict,
  4081                      die_fn=lambda: sys.exit(1)):
  3481                      die_fn=lambda: sys.exit(1)):
  4082   """Processes non-command-line input arguments.
  3482   """Processes non-command-line input arguments.
  4088   Returns:
  3488   Returns:
  4089     A dictionary of bulkloader options.
  3489     A dictionary of bulkloader options.
  4090   """
  3490   """
  4091   app_id = GetArgument(arg_dict, 'app_id', die_fn)
  3491   app_id = GetArgument(arg_dict, 'app_id', die_fn)
  4092   url = GetArgument(arg_dict, 'url', die_fn)
  3492   url = GetArgument(arg_dict, 'url', die_fn)
       
  3493   dump = GetArgument(arg_dict, 'dump', die_fn)
       
  3494   restore = GetArgument(arg_dict, 'restore', die_fn)
  4093   filename = GetArgument(arg_dict, 'filename', die_fn)
  3495   filename = GetArgument(arg_dict, 'filename', die_fn)
  4094   batch_size = GetArgument(arg_dict, 'batch_size', die_fn)
  3496   batch_size = GetArgument(arg_dict, 'batch_size', die_fn)
  4095   kind = GetArgument(arg_dict, 'kind', die_fn)
  3497   kind = GetArgument(arg_dict, 'kind', die_fn)
  4096   db_filename = GetArgument(arg_dict, 'db_filename', die_fn)
  3498   db_filename = GetArgument(arg_dict, 'db_filename', die_fn)
  4097   config_file = GetArgument(arg_dict, 'config_file', die_fn)
  3499   config_file = GetArgument(arg_dict, 'config_file', die_fn)
  4098   result_db_filename = GetArgument(arg_dict, 'result_db_filename', die_fn)
  3500   result_db_filename = GetArgument(arg_dict, 'result_db_filename', die_fn)
  4099   download = GetArgument(arg_dict, 'download', die_fn)
  3501   download = GetArgument(arg_dict, 'download', die_fn)
  4100   log_file = GetArgument(arg_dict, 'log_file', die_fn)
  3502   log_file = GetArgument(arg_dict, 'log_file', die_fn)
  4101 
  3503   perform_map = GetArgument(arg_dict, 'map', die_fn)
  4102   unused_passin = GetArgument(arg_dict, 'passin', die_fn)
       
  4103   unused_email = GetArgument(arg_dict, 'email', die_fn)
       
  4104   unused_debug = GetArgument(arg_dict, 'debug', die_fn)
       
  4105   unused_num_threads = GetArgument(arg_dict, 'num_threads', die_fn)
       
  4106   unused_bandwidth_limit = GetArgument(arg_dict, 'bandwidth_limit', die_fn)
       
  4107   unused_rps_limit = GetArgument(arg_dict, 'rps_limit', die_fn)
       
  4108   unused_http_limit = GetArgument(arg_dict, 'http_limit', die_fn)
       
  4109   unused_auth_domain = GetArgument(arg_dict, 'auth_domain', die_fn)
       
  4110   unused_has_headers = GetArgument(arg_dict, 'has_header', die_fn)
       
  4111   unused_loader_opts = GetArgument(arg_dict, 'loader_opts', die_fn)
       
  4112   unused_exporter_opts = GetArgument(arg_dict, 'exporter_opts', die_fn)
       
  4113 
  3504 
  4114   errors = []
  3505   errors = []
       
  3506 
       
  3507   if batch_size is None:
       
  3508     if download or perform_map:
       
  3509       arg_dict['batch_size'] = DEFAULT_DOWNLOAD_BATCH_SIZE
       
  3510     else:
       
  3511       arg_dict['batch_size'] = DEFAULT_BATCH_SIZE
       
  3512   elif batch_size <= 0:
       
  3513     errors.append('batch_size must be at least 1')
  4115 
  3514 
  4116   if db_filename is None:
  3515   if db_filename is None:
  4117     arg_dict['db_filename'] = time.strftime(
  3516     arg_dict['db_filename'] = time.strftime(
  4118         'bulkloader-progress-%Y%m%d.%H%M%S.sql3')
  3517         'bulkloader-progress-%Y%m%d.%H%M%S.sql3')
  4119 
  3518 
  4122         'bulkloader-results-%Y%m%d.%H%M%S.sql3')
  3521         'bulkloader-results-%Y%m%d.%H%M%S.sql3')
  4123 
  3522 
  4124   if log_file is None:
  3523   if log_file is None:
  4125     arg_dict['log_file'] = time.strftime('bulkloader-log-%Y%m%d.%H%M%S')
  3524     arg_dict['log_file'] = time.strftime('bulkloader-log-%Y%m%d.%H%M%S')
  4126 
  3525 
  4127   if batch_size <= 0:
       
  4128     errors.append('batch_size must be at least 1')
       
  4129 
       
  4130   required = '%s argument required'
  3526   required = '%s argument required'
       
  3527 
       
  3528   if config_file is None and not dump and not restore:
       
  3529     errors.append('One of --config_file, --dump, or --restore is required')
  4131 
  3530 
  4132   if url is REQUIRED_OPTION:
  3531   if url is REQUIRED_OPTION:
  4133     errors.append(required % 'url')
  3532     errors.append(required % 'url')
  4134 
  3533 
  4135   if filename is REQUIRED_OPTION:
  3534   if not filename and not perform_map:
  4136     errors.append(required % 'filename')
  3535     errors.append(required % 'filename')
  4137 
  3536 
  4138   if kind is REQUIRED_OPTION:
  3537   if kind is None:
  4139     errors.append(required % 'kind')
  3538     if download or perform_map:
  4140 
  3539       errors.append('kind argument required for this operation')
  4141   if config_file is REQUIRED_OPTION:
  3540     elif not dump and not restore:
  4142     errors.append(required % 'config_file')
  3541       errors.append(
  4143 
  3542           'kind argument required unless --dump or --restore is specified')
  4144   if download:
       
  4145     if result_db_filename is REQUIRED_OPTION:
       
  4146       errors.append(required % 'result_db_filename')
       
  4147 
  3543 
  4148   if not app_id:
  3544   if not app_id:
  4149     (unused_scheme, host_port, unused_url_path,
  3545     if url and url is not REQUIRED_OPTION:
  4150      unused_query, unused_fragment) = urlparse.urlsplit(url)
  3546       (unused_scheme, host_port, unused_url_path,
  4151     suffix_idx = host_port.find('.appspot.com')
  3547        unused_query, unused_fragment) = urlparse.urlsplit(url)
  4152     if suffix_idx > -1:
  3548       suffix_idx = host_port.find('.appspot.com')
  4153       arg_dict['app_id'] = host_port[:suffix_idx]
  3549       if suffix_idx > -1:
  4154     elif host_port.split(':')[0].endswith('google.com'):
  3550         arg_dict['app_id'] = host_port[:suffix_idx]
  4155       arg_dict['app_id'] = host_port.split('.')[0]
  3551       elif host_port.split(':')[0].endswith('google.com'):
  4156     else:
  3552         arg_dict['app_id'] = host_port.split('.')[0]
  4157       errors.append('app_id argument required for non appspot.com domains')
  3553       else:
       
  3554         errors.append('app_id argument required for non appspot.com domains')
  4158 
  3555 
  4159   if errors:
  3556   if errors:
  4160     print >>sys.stderr, '\n'.join(errors)
  3557     print >>sys.stderr, '\n'.join(errors)
  4161     die_fn()
  3558     die_fn()
  4162 
  3559 
  4201   has_header = arg_dict['has_header']
  3598   has_header = arg_dict['has_header']
  4202   download = arg_dict['download']
  3599   download = arg_dict['download']
  4203   result_db_filename = arg_dict['result_db_filename']
  3600   result_db_filename = arg_dict['result_db_filename']
  4204   loader_opts = arg_dict['loader_opts']
  3601   loader_opts = arg_dict['loader_opts']
  4205   exporter_opts = arg_dict['exporter_opts']
  3602   exporter_opts = arg_dict['exporter_opts']
       
  3603   mapper_opts = arg_dict['mapper_opts']
  4206   email = arg_dict['email']
  3604   email = arg_dict['email']
  4207   passin = arg_dict['passin']
  3605   passin = arg_dict['passin']
       
  3606   perform_map = arg_dict['map']
       
  3607   dump = arg_dict['dump']
       
  3608   restore = arg_dict['restore']
  4208 
  3609 
  4209   os.environ['AUTH_DOMAIN'] = auth_domain
  3610   os.environ['AUTH_DOMAIN'] = auth_domain
  4210 
  3611 
  4211   kind = ParseKind(kind)
  3612   kind = ParseKind(kind)
  4212 
  3613 
  4213   check_file(config_file)
  3614   if not dump and not restore:
  4214   if not download:
  3615     check_file(config_file)
       
  3616 
       
  3617   if download and perform_map:
       
  3618     logger.error('--download and --map are mutually exclusive.')
       
  3619 
       
  3620   if download or dump:
       
  3621     check_output_file(filename)
       
  3622   elif not perform_map:
  4215     check_file(filename)
  3623     check_file(filename)
       
  3624 
       
  3625   if dump:
       
  3626     Exporter.RegisterExporter(DumpExporter(kind, result_db_filename))
       
  3627   elif restore:
       
  3628     Loader.RegisterLoader(RestoreLoader(kind))
  4216   else:
  3629   else:
  4217     check_output_file(filename)
  3630     LoadConfig(config_file)
  4218 
       
  4219   LoadConfig(config_file)
       
  4220 
  3631 
  4221   os.environ['APPLICATION_ID'] = app_id
  3632   os.environ['APPLICATION_ID'] = app_id
  4222 
  3633 
  4223   throttle_layout = ThrottleLayout(bandwidth_limit, http_limit, rps_limit)
  3634   throttle_layout = ThrottleLayout(bandwidth_limit, http_limit, rps_limit)
  4224 
  3635   logger.info('Throttling transfers:')
  4225   throttle = Throttle(layout=throttle_layout)
  3636   logger.info('Bandwidth: %s bytes/second', bandwidth_limit)
       
  3637   logger.info('HTTP connections: %s/second', http_limit)
       
  3638   logger.info('Entities inserted/fetched/modified: %s/second', rps_limit)
       
  3639 
       
  3640   throttle = remote_api_throttle.Throttle(layout=throttle_layout)
  4226   signature = _MakeSignature(app_id=app_id,
  3641   signature = _MakeSignature(app_id=app_id,
  4227                              url=url,
  3642                              url=url,
  4228                              kind=kind,
  3643                              kind=kind,
  4229                              db_filename=db_filename,
  3644                              db_filename=db_filename,
  4230                              download=download,
  3645                              download=download,
       
  3646                              perform_map=perform_map,
  4231                              has_header=has_header,
  3647                              has_header=has_header,
  4232                              result_db_filename=result_db_filename)
  3648                              result_db_filename=result_db_filename,
       
  3649                              dump=dump,
       
  3650                              restore=restore)
  4233 
  3651 
  4234 
  3652 
  4235   max_queue_size = max(DEFAULT_QUEUE_SIZE, 3 * num_threads + 5)
  3653   max_queue_size = max(DEFAULT_QUEUE_SIZE, 3 * num_threads + 5)
  4236 
  3654 
  4237   if db_filename == 'skip':
  3655   if db_filename == 'skip':
  4238     progress_db = StubProgressDatabase()
  3656     progress_db = StubProgressDatabase()
  4239   elif not download:
  3657   elif not download and not perform_map and not dump:
  4240     progress_db = ProgressDatabase(db_filename, signature)
  3658     progress_db = ProgressDatabase(db_filename, signature)
  4241   else:
  3659   else:
  4242     progress_db = ExportProgressDatabase(db_filename, signature)
  3660     progress_db = ExportProgressDatabase(db_filename, signature)
  4243 
  3661 
  4244   if download:
       
  4245     result_db = ResultDatabase(result_db_filename, signature)
       
  4246 
       
  4247   return_code = 1
  3662   return_code = 1
  4248 
  3663 
  4249   if not download:
  3664   if not download and not perform_map and not dump:
  4250     loader = Loader.RegisteredLoader(kind)
  3665     loader = Loader.RegisteredLoader(kind)
  4251     try:
  3666     try:
  4252       loader.initialize(filename, loader_opts)
  3667       loader.initialize(filename, loader_opts)
  4253       workitem_generator_factory = GetCSVGeneratorFactory(
  3668       workitem_generator_factory = GetCSVGeneratorFactory(
  4254           kind, filename, batch_size, has_header)
  3669           kind, filename, batch_size, has_header)
  4255 
  3670 
  4256       app = BulkUploaderApp(arg_dict,
  3671       app = BulkUploaderApp(arg_dict,
  4257                             workitem_generator_factory,
  3672                             workitem_generator_factory,
  4258                             throttle,
  3673                             throttle,
  4259                             progress_db,
  3674                             progress_db,
  4260                             BulkLoaderThread,
       
  4261                             ProgressTrackerThread,
  3675                             ProgressTrackerThread,
  4262                             max_queue_size,
  3676                             max_queue_size,
  4263                             RequestManager,
  3677                             RequestManager,
  4264                             DataSourceThread,
  3678                             DataSourceThread,
  4265                             ReQueue,
       
  4266                             Queue.Queue)
  3679                             Queue.Queue)
  4267       try:
  3680       try:
  4268         return_code = app.Run()
  3681         return_code = app.Run()
  4269       except AuthenticationError:
  3682       except AuthenticationError:
  4270         logger.info('Authentication Failed')
  3683         logger.info('Authentication Failed')
  4271     finally:
  3684     finally:
  4272       loader.finalize()
  3685       loader.finalize()
  4273   else:
  3686   elif not perform_map:
       
  3687     result_db = ResultDatabase(result_db_filename, signature)
  4274     exporter = Exporter.RegisteredExporter(kind)
  3688     exporter = Exporter.RegisteredExporter(kind)
  4275     try:
  3689     try:
  4276       exporter.initialize(filename, exporter_opts)
  3690       exporter.initialize(filename, exporter_opts)
  4277 
  3691 
  4278       def KeyRangeGeneratorFactory(progress_queue, progress_gen):
  3692       def KeyRangeGeneratorFactory(request_manager, progress_queue,
  4279         return KeyRangeGenerator(kind, progress_queue, progress_gen)
  3693                                    progress_gen):
       
  3694         return KeyRangeItemGenerator(request_manager, kind, progress_queue,
       
  3695                                      progress_gen, DownloadItem)
  4280 
  3696 
  4281       def ExportProgressThreadFactory(progress_queue, progress_db):
  3697       def ExportProgressThreadFactory(progress_queue, progress_db):
  4282         return ExportProgressThread(kind,
  3698         return ExportProgressThread(kind,
  4283                                     progress_queue,
  3699                                     progress_queue,
  4284                                     progress_db,
  3700                                     progress_db,
  4285                                     result_db)
  3701                                     result_db)
       
  3702 
  4286       app = BulkDownloaderApp(arg_dict,
  3703       app = BulkDownloaderApp(arg_dict,
  4287                               KeyRangeGeneratorFactory,
  3704                               KeyRangeGeneratorFactory,
  4288                               throttle,
  3705                               throttle,
  4289                               progress_db,
  3706                               progress_db,
  4290                               BulkExporterThread,
       
  4291                               ExportProgressThreadFactory,
  3707                               ExportProgressThreadFactory,
  4292                               0,
  3708                               0,
  4293                               RequestManager,
  3709                               RequestManager,
  4294                               DataSourceThread,
  3710                               DataSourceThread,
  4295                               ReQueue,
       
  4296                               Queue.Queue)
  3711                               Queue.Queue)
  4297       try:
  3712       try:
  4298         return_code = app.Run()
  3713         return_code = app.Run()
  4299       except AuthenticationError:
  3714       except AuthenticationError:
  4300         logger.info('Authentication Failed')
  3715         logger.info('Authentication Failed')
  4301     finally:
  3716     finally:
  4302       exporter.finalize()
  3717       exporter.finalize()
       
  3718   elif not download:
       
  3719     mapper = Mapper.RegisteredMapper(kind)
       
  3720     try:
       
  3721       mapper.initialize(mapper_opts)
       
  3722       def KeyRangeGeneratorFactory(request_manager, progress_queue,
       
  3723                                    progress_gen):
       
  3724         return KeyRangeItemGenerator(request_manager, kind, progress_queue,
       
  3725                                      progress_gen, MapperItem)
       
  3726 
       
  3727       def MapperProgressThreadFactory(progress_queue, progress_db):
       
  3728         return MapperProgressThread(kind,
       
  3729                                     progress_queue,
       
  3730                                     progress_db)
       
  3731 
       
  3732       app = BulkMapperApp(arg_dict,
       
  3733                           KeyRangeGeneratorFactory,
       
  3734                           throttle,
       
  3735                           progress_db,
       
  3736                           MapperProgressThreadFactory,
       
  3737                           0,
       
  3738                           RequestManager,
       
  3739                           DataSourceThread,
       
  3740                           Queue.Queue)
       
  3741       try:
       
  3742         return_code = app.Run()
       
  3743       except AuthenticationError:
       
  3744         logger.info('Authentication Failed')
       
  3745     finally:
       
  3746       mapper.finalize()
  4303   return return_code
  3747   return return_code
  4304 
  3748 
  4305 
  3749 
  4306 def SetupLogging(arg_dict):
  3750 def SetupLogging(arg_dict):
  4307   """Sets up logging for the bulkloader.
  3751   """Sets up logging for the bulkloader.
  4333   console.setFormatter(formatter)
  3777   console.setFormatter(formatter)
  4334   logger.addHandler(console)
  3778   logger.addHandler(console)
  4335 
  3779 
  4336   logger.info('Logging to %s', log_file)
  3780   logger.info('Logging to %s', log_file)
  4337 
  3781 
       
  3782   remote_api_throttle.logger.setLevel(level)
       
  3783   remote_api_throttle.logger.addHandler(file_handler)
       
  3784   remote_api_throttle.logger.addHandler(console)
       
  3785 
  4338   appengine_rpc.logger.setLevel(logging.WARN)
  3786   appengine_rpc.logger.setLevel(logging.WARN)
       
  3787 
       
  3788   adaptive_thread_pool.logger.setLevel(logging.DEBUG)
       
  3789   adaptive_thread_pool.logger.addHandler(console)
       
  3790   adaptive_thread_pool.logger.addHandler(file_handler)
       
  3791   adaptive_thread_pool.logger.propagate = False
  4339 
  3792 
  4340 
  3793 
  4341 def Run(arg_dict):
  3794 def Run(arg_dict):
  4342   """Sets up and runs the bulkloader, given the options as keyword arguments.
  3795   """Sets up and runs the bulkloader, given the options as keyword arguments.
  4343 
  3796