thirdparty/google_appengine/google/appengine/ext/search/__init__.py
changeset 109 620f9b141567
child 149 f2e327a7c5de
equal deleted inserted replaced
108:261778de26ff 109:620f9b141567
       
     1 #!/usr/bin/env python
       
     2 #
       
     3 # Copyright 2007 Google Inc.
       
     4 #
       
     5 # Licensed under the Apache License, Version 2.0 (the "License");
       
     6 # you may not use this file except in compliance with the License.
       
     7 # You may obtain a copy of the License at
       
     8 #
       
     9 #     http://www.apache.org/licenses/LICENSE-2.0
       
    10 #
       
    11 # Unless required by applicable law or agreed to in writing, software
       
    12 # distributed under the License is distributed on an "AS IS" BASIS,
       
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    14 # See the License for the specific language governing permissions and
       
    15 # limitations under the License.
       
    16 #
       
    17 
       
    18 """Full text indexing and search, implemented in pure python.
       
    19 
       
    20 Defines a SearchableModel subclass of db.Model that supports full text
       
    21 indexing and search, based on the datastore's existing indexes.
       
    22 
       
    23 Don't expect too much. First, there's no ranking, which is a killer drawback.
       
    24 There's also no exact phrase match, substring match, boolean operators,
       
    25 stemming, or other common full text search features. Finally, support for stop
       
    26 words (common words that are not indexed) is currently limited to English.
       
    27 
       
    28 To be indexed, entities must be created and saved as SearchableModel
       
    29 instances, e.g.:
       
    30 
       
    31   class Article(search.SearchableModel):
       
    32     text = db.TextProperty()
       
    33     ...
       
    34 
       
    35   article = Article(text=...)
       
    36   article.save()
       
    37 
       
    38 To search the full text index, use the SearchableModel.all() method to get an
       
    39 instance of SearchableModel.Query, which subclasses db.Query. Use its search()
       
    40 method to provide a search query, in addition to any other filters or sort
       
    41 orders, e.g.:
       
    42 
       
     43   query = Article.all().search('a search query').filter(...).order(...)
       
    44   for result in query:
       
    45     ...
       
    46 
       
    47 The full text index is stored in a property named __searchable_text_index.
       
    48 
       
    49 
       
    50 In general, if you just want to provide full text search, you *don't* need to
       
    51 add any extra indexes to your index.yaml. However, if you want to use search()
       
    52 in a query *in addition to* an ancestor, filter, or sort order, you'll need to
       
    53 create an index in index.yaml with the __searchable_text_index property. For
       
    54 example:
       
    55 
       
    56   - kind: Article
       
    57     properties:
       
    58     - name: __searchable_text_index
       
    59     - name: date
       
    60       direction: desc
       
    61     ...
       
    62 
       
     63 Note that using SearchableModel will noticeably increase the latency of save()
       
    64 operations, since it writes an index row for each indexable word. This also
       
    65 means that the latency of save() will increase roughly with the size of the
       
    66 properties in a given entity. Caveat hacker!
       
    67 """
       
    68 
       
    69 
       
    70 
       
    71 
       
    72 import re
       
    73 import string
       
    74 import sys
       
    75 
       
    76 from google.appengine.api import datastore
       
    77 from google.appengine.api import datastore_errors
       
    78 from google.appengine.api import datastore_types
       
    79 from google.appengine.ext import db
       
    80 from google.appengine.datastore import datastore_pb
       
    81 
       
    82 class SearchableEntity(datastore.Entity):
       
    83   """A subclass of datastore.Entity that supports full text indexing.
       
    84 
       
    85   Automatically indexes all string and Text properties, using the datastore's
       
    86   built-in per-property indices. To search, use the SearchableQuery class and
       
    87   its Search() method.
       
    88   """
       
    89   _FULL_TEXT_INDEX_PROPERTY = '__searchable_text_index'
       
    90 
       
    91   _FULL_TEXT_MIN_LENGTH = 3
       
    92 
       
    93   _FULL_TEXT_STOP_WORDS = frozenset([
       
    94    'a', 'about', 'according', 'accordingly', 'affected', 'affecting', 'after',
       
    95    'again', 'against', 'all', 'almost', 'already', 'also', 'although',
       
    96    'always', 'am', 'among', 'an', 'and', 'any', 'anyone', 'apparently', 'are',
       
    97    'arise', 'as', 'aside', 'at', 'away', 'be', 'became', 'because', 'become',
       
    98    'becomes', 'been', 'before', 'being', 'between', 'both', 'briefly', 'but',
       
    99    'by', 'came', 'can', 'cannot', 'certain', 'certainly', 'could', 'did', 'do',
       
   100    'does', 'done', 'during', 'each', 'either', 'else', 'etc', 'ever', 'every',
       
   101    'following', 'for', 'found', 'from', 'further', 'gave', 'gets', 'give',
       
   102    'given', 'giving', 'gone', 'got', 'had', 'hardly', 'has', 'have', 'having',
       
   103    'here', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'itself',
       
   104    'just', 'keep', 'kept', 'knowledge', 'largely', 'like', 'made', 'mainly',
       
   105    'make', 'many', 'might', 'more', 'most', 'mostly', 'much', 'must', 'nearly',
       
   106    'necessarily', 'neither', 'next', 'no', 'none', 'nor', 'normally', 'not',
       
   107    'noted', 'now', 'obtain', 'obtained', 'of', 'often', 'on', 'only', 'or',
       
   108    'other', 'our', 'out', 'owing', 'particularly', 'past', 'perhaps', 'please',
       
   109    'poorly', 'possible', 'possibly', 'potentially', 'predominantly', 'present',
       
   110    'previously', 'primarily', 'probably', 'prompt', 'promptly', 'put',
       
   111    'quickly', 'quite', 'rather', 'readily', 'really', 'recently', 'regarding',
       
   112    'regardless', 'relatively', 'respectively', 'resulted', 'resulting',
       
   113    'results', 'said', 'same', 'seem', 'seen', 'several', 'shall', 'should',
       
   114    'show', 'showed', 'shown', 'shows', 'significantly', 'similar', 'similarly',
       
   115    'since', 'slightly', 'so', 'some', 'sometime', 'somewhat', 'soon',
       
   116    'specifically', 'state', 'states', 'strongly', 'substantially',
       
   117    'successfully', 'such', 'sufficiently', 'than', 'that', 'the', 'their',
       
   118    'theirs', 'them', 'then', 'there', 'therefore', 'these', 'they', 'this',
       
   119    'those', 'though', 'through', 'throughout', 'to', 'too', 'toward', 'under',
       
   120    'unless', 'until', 'up', 'upon', 'use', 'used', 'usefully', 'usefulness',
       
   121    'using', 'usually', 'various', 'very', 'was', 'we', 'were', 'what', 'when',
       
   122    'where', 'whether', 'which', 'while', 'who', 'whose', 'why', 'widely',
       
   123    'will', 'with', 'within', 'without', 'would', 'yet', 'you'])
       
   124 
       
   125   _PUNCTUATION_REGEX = re.compile('[' + re.escape(string.punctuation) + ']')
       
   126 
       
   127   def __init__(self, kind_or_entity, *args, **kwargs):
       
   128     """Constructor. May be called as a copy constructor.
       
   129 
       
   130     If kind_or_entity is a datastore.Entity, copies it into this Entity.
       
   131     datastore.Get() and Query() returns instances of datastore.Entity, so this
       
   132     is useful for converting them back to SearchableEntity so that they'll be
       
   133     indexed when they're stored back in the datastore.
       
   134 
       
   135     Otherwise, passes through the positional and keyword args to the
       
   136     datastore.Entity constructor.
       
   137 
       
   138     Args:
       
   139       kind_or_entity: string or datastore.Entity
       
   140     """
       
   141     if isinstance(kind_or_entity, datastore.Entity):
       
   142       self._Entity__key = kind_or_entity._Entity__key
       
   143       self.update(kind_or_entity)
       
   144     else:
       
   145       super(SearchableEntity, self).__init__(kind_or_entity, *args, **kwargs)
       
   146 
       
   147   def _ToPb(self):
       
   148     """Rebuilds the full text index, then delegates to the superclass.
       
   149 
       
   150     Returns:
       
   151       entity_pb.Entity
       
   152     """
       
   153     if SearchableEntity._FULL_TEXT_INDEX_PROPERTY in self:
       
   154       del self[SearchableEntity._FULL_TEXT_INDEX_PROPERTY]
       
   155 
       
   156     index = set()
       
   157     for (name, values) in self.items():
       
   158       if not isinstance(values, list):
       
   159         values = [values]
       
   160       if (isinstance(values[0], basestring) and
       
   161           not isinstance(values[0], datastore_types.Blob)):
       
   162         for value in values:
       
   163           index.update(SearchableEntity._FullTextIndex(value))
       
   164 
       
   165     index_list = list(index)
       
   166     if index_list:
       
   167       self[SearchableEntity._FULL_TEXT_INDEX_PROPERTY] = index_list
       
   168 
       
   169     return super(SearchableEntity, self)._ToPb()
       
   170 
       
   171   @classmethod
       
   172   def _FullTextIndex(cls, text):
       
   173     """Returns a set of keywords appropriate for full text indexing.
       
   174 
       
   175     See SearchableQuery.Search() for details.
       
   176 
       
   177     Args:
       
   178       text: string
       
   179 
       
   180     Returns:
       
   181       set of strings
       
   182     """
       
   183 
       
   184     if text:
       
   185       datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
       
   186       text = cls._PUNCTUATION_REGEX.sub(' ', text)
       
   187       words = text.lower().split()
       
   188 
       
   189       words = set(words)
       
   190 
       
   191       words -= cls._FULL_TEXT_STOP_WORDS
       
   192       for word in list(words):
       
   193         if len(word) < cls._FULL_TEXT_MIN_LENGTH:
       
   194           words.remove(word)
       
   195 
       
   196     else:
       
   197       words = set()
       
   198 
       
   199     return words
       
   200 
       
   201 
       
   202 class SearchableQuery(datastore.Query):
       
   203   """A subclass of datastore.Query that supports full text search.
       
   204 
       
   205   Only searches over entities that were created and stored using the
       
   206   SearchableEntity or SearchableModel classes.
       
   207   """
       
   208 
       
   209   def Search(self, search_query):
       
   210     """Add a search query. This may be combined with filters.
       
   211 
       
   212     Note that keywords in the search query will be silently dropped if they
       
   213     are stop words or too short, ie if they wouldn't be indexed.
       
   214 
       
   215     Args:
       
   216      search_query: string
       
   217 
       
   218     Returns:
       
   219       # this query
       
   220       SearchableQuery
       
   221     """
       
   222     datastore_types.ValidateString(search_query, 'search query')
       
   223     self._search_query = search_query
       
   224     return self
       
   225 
       
   226   def _ToPb(self, limit=None, offset=None):
       
   227     """Adds filters for the search query, then delegates to the superclass.
       
   228 
       
   229     Raises BadFilterError if a filter on the index property already exists.
       
   230 
       
   231     Args:
       
   232       # an upper bound on the number of results returned by the query.
       
   233       limit: int
       
   234       # number of results that match the query to skip.  limit is applied
       
   235       # after the offset is fulfilled.
       
   236       offset: int
       
   237 
       
   238     Returns:
       
   239       datastore_pb.Query
       
   240     """
       
   241     if SearchableEntity._FULL_TEXT_INDEX_PROPERTY in self:
       
   242       raise datastore_errors.BadFilterError(
       
   243         '%s is a reserved name.' % SearchableEntity._FULL_TEXT_INDEX_PROPERTY)
       
   244 
       
   245     pb = super(SearchableQuery, self)._ToPb(limit=limit, offset=offset)
       
   246 
       
   247     if hasattr(self, '_search_query'):
       
   248       keywords = SearchableEntity._FullTextIndex(self._search_query)
       
   249       for keyword in keywords:
       
   250         filter = pb.add_filter()
       
   251         filter.set_op(datastore_pb.Query_Filter.EQUAL)
       
   252         prop = filter.add_property()
       
   253         prop.set_name(SearchableEntity._FULL_TEXT_INDEX_PROPERTY)
       
   254         prop.mutable_value().set_stringvalue(keyword)
       
   255 
       
   256     return pb
       
   257 
       
   258 
       
   259 class SearchableModel(db.Model):
       
   260   """A subclass of db.Model that supports full text search and indexing.
       
   261 
       
   262   Automatically indexes all string-based properties. To search, use the all()
       
   263   method to get a SearchableModel.Query, then use its search() method.
       
   264   """
       
   265 
       
   266   class Query(db.Query):
       
   267     """A subclass of db.Query that supports full text search."""
       
   268     _search_query = None
       
   269 
       
   270     def search(self, search_query):
       
   271       """Adds a full text search to this query.
       
   272 
       
   273       Args:
       
   274         search_query, a string containing the full text search query.
       
   275 
       
   276       Returns:
       
   277         self
       
   278       """
       
   279       self._search_query = search_query
       
   280       return self
       
   281 
       
   282     def _get_query(self):
       
   283       """Wraps db.Query._get_query() and injects SearchableQuery."""
       
   284       query = db.Query._get_query(self, _query_class=SearchableQuery)
       
   285       if self._search_query:
       
   286         query.Search(self._search_query)
       
   287       return query
       
   288 
       
   289   def _populate_internal_entity(self):
       
   290     """Wraps db.Model._populate_internal_entity() and injects
       
   291     SearchableEntity."""
       
   292     return db.Model._populate_internal_entity(self,
       
   293                                               _entity_class=SearchableEntity)
       
   294 
       
   295   @classmethod
       
   296   def from_entity(cls, entity):
       
   297     """Wraps db.Model.from_entity() and injects SearchableEntity."""
       
   298     if not isinstance(entity, SearchableEntity):
       
   299       entity = SearchableEntity(entity)
       
   300     return super(SearchableModel, cls).from_entity(entity)
       
   301 
       
   302   @classmethod
       
   303   def all(cls):
       
   304     """Returns a SearchableModel.Query for this kind."""
       
   305     return SearchableModel.Query(cls)