thirdparty/google_appengine/google/appengine/ext/bulkload/__init__.py
changeset 109 620f9b141567
child 149 f2e327a7c5de
equal deleted inserted replaced
108:261778de26ff 109:620f9b141567
       
     1 #!/usr/bin/env python
       
     2 #
       
     3 # Copyright 2007 Google Inc.
       
     4 #
       
     5 # Licensed under the Apache License, Version 2.0 (the "License");
       
     6 # you may not use this file except in compliance with the License.
       
     7 # You may obtain a copy of the License at
       
     8 #
       
     9 #     http://www.apache.org/licenses/LICENSE-2.0
       
    10 #
       
    11 # Unless required by applicable law or agreed to in writing, software
       
    12 # distributed under the License is distributed on an "AS IS" BASIS,
       
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    14 # See the License for the specific language governing permissions and
       
    15 # limitations under the License.
       
    16 #
       
    17 
       
    18 """A mix-in handler for bulk loading data into an application.
       
    19 
       
    20 For complete documentation, see the Tools and Libraries section of the
       
    21 documentation.
       
    22 
       
    23 To use this in your app, first write a script, e.g. bulkload.py, that
       
     24 instantiates a Loader for each entity kind you want to import and calls
       
    25 bulkload.main(instance). For example:
       
    26 
       
    27 person = bulkload.Loader(
       
    28   'Person',
       
    29   [('name', str),
       
    30    ('email', datastore_types.Email),
       
    31    ('birthdate', lambda x: datetime.datetime.fromtimestamp(float(x))),
       
    32   ])
       
    33 
       
    34 if __name__ == '__main__':
       
    35   bulkload.main(person)
       
    36 
       
    37 See the Loader class for more information. Then, add a handler for it in your
       
    38 app.yaml, e.g.:
       
    39 
       
    40   urlmap:
       
    41   - regex: /load
       
    42     handler:
       
    43       type: 1
       
    44       path: bulkload.py
       
    45       requires_login: true
       
    46       admin_only: true
       
    47 
       
    48 Finally, deploy your app and run bulkload_client.py. For example, to load the
       
    49 file people.csv into a dev_appserver running on your local machine:
       
    50 
       
    51 ./bulkload_client.py --filename people.csv --kind Person --cookie ... \
       
    52                      --url http://localhost:8080/load
       
    53 
       
    54 The kind parameter is used to look up the Loader instance that will be used.
       
    55 The bulkload handler should usually be admin_only, so that non-admins can't use
       
     56 the handler to modify your app's data. The bulkload client uses the cookie
       
    57 parameter to piggyback its HTTP requests on your login session. A GET request
       
    58 to the URL specified for your bulkload script will give you a cookie parameter
       
    59 you can use (/load in the example above).  If your bulkload handler is not
       
    60 admin_only, you may omit the cookie parameter.
       
    61 
       
    62 If you want to do extra processing before the entities are stored, you can
       
    63 subclass Loader and override HandleEntity. HandleEntity is called once with
       
    64 each entity that is imported from the CSV data. You can return one or more
       
    65 entities from HandleEntity to be stored in its place, or None if nothing
       
    66 should be stored.
       
    67 
       
    68 For example, this loads calendar events and stores them as
       
    69 datastore_entities.Event entities. It also populates their author field with a
       
     70 reference to the corresponding datastore_entities.Contact entity. If no Contact
       
    71 entity exists yet for the given author, it creates one and stores it first.
       
    72 
       
    73 class EventLoader(bulkload.Loader):
       
    74   def __init__(self):
       
     75     bulkload.Loader.__init__(self, 'Event',
       
     76                              [('title', str),
       
     77                               ('creator', str),
       
     78                               ('where', str),
       
     79                               ('startTime', lambda x:
       
     80                                 datetime.datetime.fromtimestamp(float(x))),
       
     81                               ])
       
    82 
       
    83   def HandleEntity(self, entity):
       
    84     event = datastore_entities.Event(entity.title)
       
    85     event.update(entity)
       
    86 
       
    87     creator = event['creator']
       
    88     if creator:
       
    89       contact = datastore.Query('Contact', {'title': creator}).Get(1)
       
    90       if not contact:
       
    91         contact = [datastore_entities.Contact(creator)]
       
    92         datastore.Put(contact[0])
       
    93       event['author'] = contact[0].key()
       
    94 
       
    95     return event
       
    96 
       
    97 if __name__ == '__main__':
       
    98   bulkload.main(EventLoader())
       
    99 """
       
   100 
       
   101 
       
   102 
       
   103 
       
   104 
       
   105 import Cookie
       
   106 import StringIO
       
   107 import csv
       
   108 import httplib
       
   109 import os
       
   110 import sys
       
   111 import traceback
       
   112 import types
       
   113 
       
   114 
       
   115 import google
       
   116 import wsgiref.handlers
       
   117 
       
   118 from google.appengine.api import datastore
       
   119 from google.appengine.api import datastore_types
       
   120 from google.appengine.ext import webapp
       
   121 from google.appengine.ext.bulkload import constants
       
   122 
       
   123 
       
   124 def Validate(value, type):
       
   125   """ Checks that value is non-empty and of the right type.
       
   126 
       
   127   Raises ValueError if value is None or empty, TypeError if it's not the given
       
   128   type.
       
   129 
       
   130   Args:
       
   131     value: any value
       
   132     type: a type or tuple of types
       
   133   """
       
   134   if not value:
       
   135     raise ValueError('Value should not be empty; received %s.' % value)
       
   136   elif not isinstance(value, type):
       
   137     raise TypeError('Expected a %s, but received %s (a %s).' %
       
   138                     (type, value, value.__class__))
       
   139 
       
   140 
       
   141 class Loader(object):
       
   142   """ A base class for creating datastore entities from CSV input data.
       
   143 
       
   144   To add a handler for bulk loading a new entity kind into your datastore,
       
   145   write a subclass of this class that calls Loader.__init__ from your
       
   146   class's __init__.
       
   147 
       
   148   If you need to run extra code to convert entities from CSV, create new
       
   149   properties, or otherwise modify the entities before they're inserted,
       
   150   override HandleEntity.
       
   151   """
       
   152 
       
   153   __loaders = {}
       
   154   __kind = None
       
   155   __properties = None
       
   156 
       
   157   def __init__(self, kind, properties):
       
   158     """ Constructor.
       
   159 
       
   160     Populates this Loader's kind and properties map. Also registers it with
       
   161     the bulk loader, so that all you need to do is instantiate your Loader,
       
   162     and the bulkload handler will automatically use it.
       
   163 
       
   164     Args:
       
   165       kind: a string containing the entity kind that this loader handles
       
   166 
       
   167       properties: list of (name, converter) tuples.
       
   168 
       
   169       This is used to automatically convert the CSV columns into properties.
       
   170       The converter should be a function that takes one argument, a string
       
   171       value from the CSV file, and returns a correctly typed property value
       
   172       that should be inserted. The tuples in this list should match the
       
   173       columns in your CSV file, in order.
       
   174 
       
   175       For example:
       
   176         [('name', str),
       
   177          ('id_number', int),
       
   178          ('email', datastore_types.Email),
       
   179          ('user', users.User),
       
   180          ('birthdate', lambda x: datetime.datetime.fromtimestamp(float(x))),
       
   181          ('description', datastore_types.Text),
       
   182          ]
       
   183     """
       
   184     Validate(kind, basestring)
       
   185     self.__kind = kind
       
   186 
       
   187     Validate(properties, list)
       
   188     for name, fn in properties:
       
   189       Validate(name, basestring)
       
   190       assert callable(fn), (
       
   191         'Conversion function %s for property %s is not callable.' % (fn, name))
       
   192 
       
   193     self.__properties = properties
       
   194 
       
   195     Loader.__loaders[kind] = self
       
   196 
       
   197 
       
   198   def kind(self):
       
   199     """ Return the entity kind that this Loader handes.
       
   200     """
       
   201     return self.__kind
       
   202 
       
   203 
       
   204   def CreateEntity(self, values):
       
   205     """ Creates an entity from a list of property values.
       
   206 
       
   207     Args:
       
   208       values: list of str
       
   209 
       
   210     Returns:
       
   211       list of datastore.Entity
       
   212 
       
   213       The returned entities are populated with the property values from the
       
   214       argument, converted to native types using the properties map given in
       
   215       the constructor, and passed through HandleEntity. They're ready to be
       
   216       inserted.
       
   217 
       
   218     Raises an AssertionError if the number of values doesn't match the number
       
   219     of properties in the properties map.
       
   220     """
       
   221     Validate(values, list)
       
   222     assert len(values) == len(self.__properties), (
       
   223       'Expected %d CSV columns, found %d.' %
       
   224       (len(self.__properties), len(values)))
       
   225 
       
   226     entity = datastore.Entity(self.__kind)
       
   227     for (name, converter), val in zip(self.__properties, values):
       
   228       entity[name] = converter(val)
       
   229 
       
   230     entities = self.HandleEntity(entity)
       
   231 
       
   232     if entities is not None:
       
   233       if not isinstance(entities, list):
       
   234         entities = [entities]
       
   235 
       
   236       for entity in entities:
       
   237         if not isinstance(entity, datastore.Entity):
       
   238           raise TypeError('Expected a datastore.Entity, received %s (a %s).' %
       
   239                           (entity, entity.__class__))
       
   240 
       
   241     return entities
       
   242 
       
   243 
       
   244   def HandleEntity(self, entity):
       
   245     """ Subclasses can override this to add custom entity conversion code.
       
   246 
       
   247     This is called for each entity, after its properties are populated from
       
   248     CSV but before it is stored. Subclasses can override this to add custom
       
   249     entity handling code.
       
   250 
       
   251     The entity to be inserted should be returned. If multiple entities should
       
   252     be inserted, return a list of entities. If no entities should be inserted,
       
   253     return None or [].
       
   254 
       
   255     Args:
       
   256       entity: datastore.Entity
       
   257 
       
   258     Returns:
       
   259       datastore.Entity or list of datastore.Entity
       
   260     """
       
   261     return entity
       
   262 
       
   263 
       
   264   @staticmethod
       
   265   def RegisteredLoaders():
       
   266     """ Returns a list of the Loader instances that have been created.
       
   267     """
       
   268     return dict(Loader.__loaders)
       
   269 
       
   270 
       
   271 class BulkLoad(webapp.RequestHandler):
       
   272   """ A handler for bulk load requests.
       
   273   """
       
   274 
       
   275   def get(self):
       
   276     """ Handle a GET. Just show an info page.
       
   277     """
       
   278     page = self.InfoPage(self.request.uri)
       
   279     self.response.out.write(page)
       
   280 
       
   281 
       
   282   def post(self):
       
   283     """ Handle a POST. Reads CSV data, converts to entities, and stores them.
       
   284     """
       
   285     self.response.headers['Content-Type'] = 'text/plain'
       
   286     response, output = self.Load(self.request.get(constants.KIND_PARAM),
       
   287                                  self.request.get(constants.CSV_PARAM))
       
   288     self.response.set_status(response)
       
   289     self.response.out.write(output)
       
   290 
       
   291 
       
   292   def InfoPage(self, uri):
       
   293     """ Renders an information page with the POST endpoint and cookie flag.
       
   294 
       
   295     Args:
       
   296       uri: a string containing the request URI
       
   297     Returns:
       
   298       A string with the contents of the info page to be displayed
       
   299     """
       
   300     page = """
       
   301 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
       
   302  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
       
   303 <html><head>
       
   304 <title>Bulk Loader</title>
       
   305 </head><body>"""
       
   306 
       
   307     page += ('The bulk load endpoint is: <a href="%s">%s</a><br />\n' %
       
   308             (uri, uri))
       
   309 
       
   310     cookies = os.environ.get('HTTP_COOKIE', None)
       
   311     if cookies:
       
   312       cookie = Cookie.BaseCookie(cookies)
       
   313       for param in ['ACSID', 'dev_appserver_login']:
       
   314         value = cookie.get(param)
       
   315         if value:
       
   316           page += ("Pass this flag to the client: --cookie='%s=%s'\n" %
       
   317                    (param, value.value))
       
   318           break
       
   319 
       
   320     else:
       
   321       page += 'No cookie found!\n'
       
   322 
       
   323     page += '</body></html>'
       
   324     return page
       
   325 
       
   326 
       
   327   def Load(self, kind, data):
       
   328     """ Parses CSV data, uses a Loader to convert to entities, and stores them.
       
   329 
       
   330     On error, fails fast. Returns a "bad request" HTTP response code and
       
   331     includes the traceback in the output.
       
   332 
       
   333     Args:
       
   334       kind: a string containing the entity kind that this loader handles
       
   335       data: a string containing the CSV data to load
       
   336 
       
   337     Returns:
       
   338       tuple (response code, output) where:
       
   339         response code: integer HTTP response code to return
       
   340         output: string containing the HTTP response body
       
   341     """
       
   342     Validate(kind, basestring)
       
   343     Validate(data, basestring)
       
   344     output = []
       
   345 
       
   346     try:
       
   347       loader = Loader.RegisteredLoaders()[kind]
       
   348     except KeyError:
       
   349       output.append('Error: no Loader defined for kind %s.' % kind)
       
   350       return (httplib.BAD_REQUEST, ''.join(output))
       
   351 
       
   352     buffer = StringIO.StringIO(data)
       
   353     reader = csv.reader(buffer, skipinitialspace=True)
       
   354 
       
   355     try:
       
   356       csv.field_size_limit(800000)
       
   357     except AttributeError:
       
   358       pass
       
   359 
       
   360     entities = []
       
   361 
       
   362     line_num = 1
       
   363     for columns in reader:
       
   364       if columns:
       
   365         try:
       
   366           output.append('\nLoading from line %d...' % line_num)
       
   367           new_entities = loader.CreateEntity(columns)
       
   368           if new_entities:
       
   369             entities.extend(new_entities)
       
   370           output.append('done.')
       
   371         except:
       
   372           exc_info = sys.exc_info()
       
   373           stacktrace = traceback.format_exception(*exc_info)
       
   374           output.append('error:\n%s' % stacktrace)
       
   375           return (httplib.BAD_REQUEST, ''.join(output))
       
   376 
       
   377       line_num += 1
       
   378 
       
   379     for entity in entities:
       
   380       datastore.Put(entity)
       
   381 
       
   382     return (httplib.OK, ''.join(output))
       
   383 
       
   384 
       
   385 def main(*loaders):
       
   386   """Starts bulk upload.
       
   387 
       
   388   Raises TypeError if not, at least one Loader instance is given.
       
   389 
       
   390   Args:
       
   391     loaders: One or more Loader instance.
       
   392   """
       
   393   if not loaders:
       
   394     raise TypeError('Expected at least one argument.')
       
   395 
       
   396   for loader in loaders:
       
   397     if not isinstance(loader, Loader):
       
   398       raise TypeError('Expected a Loader instance; received %r' % loader)
       
   399 
       
   400   application = webapp.WSGIApplication([('.*', BulkLoad)])
       
   401   wsgiref.handlers.CGIHandler().run(application)
       
   402 
       
# NOTE(review): main() is called here without any loaders, and main() raises
# TypeError when given no arguments, so running this module directly as a
# script always fails. Confirm whether this guard is intentional.
if __name__ == '__main__':
  main()