|
1 #!/usr/bin/env python |
|
2 # |
|
3 # Copyright 2007 Google Inc. |
|
4 # |
|
5 # Licensed under the Apache License, Version 2.0 (the "License"); |
|
6 # you may not use this file except in compliance with the License. |
|
7 # You may obtain a copy of the License at |
|
8 # |
|
9 # http://www.apache.org/licenses/LICENSE-2.0 |
|
10 # |
|
11 # Unless required by applicable law or agreed to in writing, software |
|
12 # distributed under the License is distributed on an "AS IS" BASIS, |
|
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14 # See the License for the specific language governing permissions and |
|
15 # limitations under the License. |
|
16 # |
|
17 |
|
18 """Full text indexing and search, implemented in pure python. |
|
19 |
|
20 Defines a SearchableModel subclass of db.Model that supports full text |
|
21 indexing and search, based on the datastore's existing indexes. |
|
22 |
|
23 Don't expect too much. First, there's no ranking, which is a killer drawback. |
|
24 There's also no exact phrase match, substring match, boolean operators, |
|
25 stemming, or other common full text search features. Finally, support for stop |
|
26 words (common words that are not indexed) is currently limited to English. |
|
27 |
|
28 To be indexed, entities must be created and saved as SearchableModel |
|
29 instances, e.g.: |
|
30 |
|
31 class Article(search.SearchableModel): |
|
32 text = db.TextProperty() |
|
33 ... |
|
34 |
|
35 article = Article(text=...) |
|
36 article.save() |
|
37 |
|
38 To search the full text index, use the SearchableModel.all() method to get an |
|
39 instance of SearchableModel.Query, which subclasses db.Query. Use its search() |
|
40 method to provide a search query, in addition to any other filters or sort |
|
41 orders, e.g.: |
|
42 |
|
43 query = article.all().search('a search query').filter(...).order(...) |
|
44 for result in query: |
|
45 ... |
|
46 |
|
47 The full text index is stored in a property named __searchable_text_index. |
|
48 |
|
49 |
|
50 In general, if you just want to provide full text search, you *don't* need to |
|
51 add any extra indexes to your index.yaml. However, if you want to use search() |
|
52 in a query *in addition to* an ancestor, filter, or sort order, you'll need to |
|
53 create an index in index.yaml with the __searchable_text_index property. For |
|
54 example: |
|
55 |
|
56 - kind: Article |
|
57 properties: |
|
58 - name: __searchable_text_index |
|
59 - name: date |
|
60 direction: desc |
|
61 ... |
|
62 |
|
63 Note that using SearchableModel will noticeably increase the latency of save() |
|
64 operations, since it writes an index row for each indexable word. This also |
|
65 means that the latency of save() will increase roughly with the size of the |
|
66 properties in a given entity. Caveat hacker! |
|
67 """ |
|
68 |
|
69 |
|
70 |
|
71 |
|
72 import re |
|
73 import string |
|
74 import sys |
|
75 |
|
76 from google.appengine.api import datastore |
|
77 from google.appengine.api import datastore_errors |
|
78 from google.appengine.api import datastore_types |
|
79 from google.appengine.ext import db |
|
80 from google.appengine.datastore import datastore_pb |
|
81 |
|
class SearchableEntity(datastore.Entity):
  """A subclass of datastore.Entity that supports full text indexing.

  Automatically indexes all string and Text properties, using the datastore's
  built-in per-property indices. To search, use the SearchableQuery class and
  its Search() method.

  The index is a list of keywords stored on the entity itself under the
  property named by _FULL_TEXT_INDEX_PROPERTY; it is rebuilt from scratch
  every time the entity is converted to a protocol buffer (see _ToPb()).
  """

  # Name of the property that holds the list of indexed keywords.
  _FULL_TEXT_INDEX_PROPERTY = '__searchable_text_index'

  # Words with fewer characters than this are never indexed.
  _FULL_TEXT_MIN_LENGTH = 3

  # Lower-case English words that are never indexed.
  _FULL_TEXT_STOP_WORDS = frozenset([
   'a', 'about', 'according', 'accordingly', 'affected', 'affecting', 'after',
   'again', 'against', 'all', 'almost', 'already', 'also', 'although',
   'always', 'am', 'among', 'an', 'and', 'any', 'anyone', 'apparently', 'are',
   'arise', 'as', 'aside', 'at', 'away', 'be', 'became', 'because', 'become',
   'becomes', 'been', 'before', 'being', 'between', 'both', 'briefly', 'but',
   'by', 'came', 'can', 'cannot', 'certain', 'certainly', 'could', 'did', 'do',
   'does', 'done', 'during', 'each', 'either', 'else', 'etc', 'ever', 'every',
   'following', 'for', 'found', 'from', 'further', 'gave', 'gets', 'give',
   'given', 'giving', 'gone', 'got', 'had', 'hardly', 'has', 'have', 'having',
   'here', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'itself',
   'just', 'keep', 'kept', 'knowledge', 'largely', 'like', 'made', 'mainly',
   'make', 'many', 'might', 'more', 'most', 'mostly', 'much', 'must', 'nearly',
   'necessarily', 'neither', 'next', 'no', 'none', 'nor', 'normally', 'not',
   'noted', 'now', 'obtain', 'obtained', 'of', 'often', 'on', 'only', 'or',
   'other', 'our', 'out', 'owing', 'particularly', 'past', 'perhaps', 'please',
   'poorly', 'possible', 'possibly', 'potentially', 'predominantly', 'present',
   'previously', 'primarily', 'probably', 'prompt', 'promptly', 'put',
   'quickly', 'quite', 'rather', 'readily', 'really', 'recently', 'regarding',
   'regardless', 'relatively', 'respectively', 'resulted', 'resulting',
   'results', 'said', 'same', 'seem', 'seen', 'several', 'shall', 'should',
   'show', 'showed', 'shown', 'shows', 'significantly', 'similar', 'similarly',
   'since', 'slightly', 'so', 'some', 'sometime', 'somewhat', 'soon',
   'specifically', 'state', 'states', 'strongly', 'substantially',
   'successfully', 'such', 'sufficiently', 'than', 'that', 'the', 'their',
   'theirs', 'them', 'then', 'there', 'therefore', 'these', 'they', 'this',
   'those', 'though', 'through', 'throughout', 'to', 'too', 'toward', 'under',
   'unless', 'until', 'up', 'upon', 'use', 'used', 'usefully', 'usefulness',
   'using', 'usually', 'various', 'very', 'was', 'we', 'were', 'what', 'when',
   'where', 'whether', 'which', 'while', 'who', 'whose', 'why', 'widely',
   'will', 'with', 'within', 'without', 'would', 'yet', 'you'])

  # Matches any single character of string.punctuation (ASCII punctuation
  # only); each match is replaced by a space before the text is split.
  _PUNCTUATION_REGEX = re.compile('[' + re.escape(string.punctuation) + ']')

  def __init__(self, kind_or_entity, *args, **kwargs):
    """Constructor. May be called as a copy constructor.

    If kind_or_entity is a datastore.Entity, copies it into this Entity.
    datastore.Get() and Query() returns instances of datastore.Entity, so this
    is useful for converting them back to SearchableEntity so that they'll be
    indexed when they're stored back in the datastore.

    Otherwise, passes through the positional and keyword args to the
    datastore.Entity constructor.

    Args:
      kind_or_entity: string or datastore.Entity
    """
    if isinstance(kind_or_entity, datastore.Entity):
      # The key lives in a name-mangled private attribute of Entity, so it is
      # copied through its mangled name. The superclass constructor is not
      # called on this path.
      # NOTE(review): only the key and the properties are copied -- confirm
      # that datastore.Entity keeps no other per-instance state.
      self._Entity__key = kind_or_entity._Entity__key
      self.update(kind_or_entity)
    else:
      super(SearchableEntity, self).__init__(kind_or_entity, *args, **kwargs)

  def _ToPb(self):
    """Rebuilds the full text index, then delegates to the superclass.

    As a side effect, the index property on this entity is replaced by the
    freshly computed keyword list, or removed if there are no keywords.

    Returns:
      entity_pb.Entity
    """
    # Drop the stale index first so that its keywords are not fed back into
    # the new index by the loop below.
    if SearchableEntity._FULL_TEXT_INDEX_PROPERTY in self:
      del self[SearchableEntity._FULL_TEXT_INDEX_PROPERTY]

    index = set()
    for values in self.values():
      if not isinstance(values, list):
        values = [values]
      # Decide per value rather than from the first element only, so that an
      # empty list, or a list that mixes strings with other types, neither
      # raises nor indexes a Blob.
      for value in values:
        if (isinstance(value, basestring) and
            not isinstance(value, datastore_types.Blob)):
          index.update(SearchableEntity._FullTextIndex(value))

    index_list = list(index)
    if index_list:
      self[SearchableEntity._FULL_TEXT_INDEX_PROPERTY] = index_list

    return super(SearchableEntity, self)._ToPb()

  @classmethod
  def _FullTextIndex(cls, text):
    """Returns a set of keywords appropriate for full text indexing.

    Punctuation is replaced by spaces, the text is lower-cased and split on
    whitespace, and stop words and words shorter than _FULL_TEXT_MIN_LENGTH
    are discarded.

    See SearchableQuery.Search() for details.

    Args:
      text: string

    Returns:
      set of strings; empty if text is empty or None
    """
    if not text:
      return set()

    # max_len is lifted to sys.maxint so long text is not rejected for length.
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = cls._PUNCTUATION_REGEX.sub(' ', text)

    return set(word for word in text.lower().split()
               if len(word) >= cls._FULL_TEXT_MIN_LENGTH
               and word not in cls._FULL_TEXT_STOP_WORDS)
|
200 |
|
201 |
|
class SearchableQuery(datastore.Query):
  """A datastore.Query subclass that adds full text search.

  Matches only entities that were written through SearchableEntity or
  SearchableModel, since those are the ones carrying the keyword index
  property.
  """

  def Search(self, search_query):
    """Attaches a search query to this query; filters may be added as well.

    Keywords in the search query that are stop words or too short, ie that
    would not have been indexed, are silently dropped.

    Args:
      search_query: string

    Returns:
      # this query, to allow call chaining
      SearchableQuery
    """
    datastore_types.ValidateString(search_query, 'search query')
    self._search_query = search_query
    return self

  def _ToPb(self, limit=None, offset=None):
    """Builds the superclass query pb, then appends the keyword filters.

    One EQUAL filter on the index property is added for each keyword
    extracted from the search query, if Search() has been called.

    Raises BadFilterError if the index property name is already present as a
    key of this query.

    Args:
      # an upper bound on the number of results returned by the query.
      limit: int
      # number of results that match the query to skip. limit is applied
      # after the offset is fulfilled.
      offset: int

    Returns:
      datastore_pb.Query
    """
    index_property = SearchableEntity._FULL_TEXT_INDEX_PROPERTY
    if index_property in self:
      raise datastore_errors.BadFilterError(
          '%s is a reserved name.' % index_property)

    query_pb = super(SearchableQuery, self)._ToPb(limit=limit, offset=offset)

    # _search_query only exists once Search() has been called.
    if not hasattr(self, '_search_query'):
      return query_pb

    for keyword in SearchableEntity._FullTextIndex(self._search_query):
      keyword_filter = query_pb.add_filter()
      keyword_filter.set_op(datastore_pb.Query_Filter.EQUAL)
      filter_property = keyword_filter.add_property()
      filter_property.set_name(index_property)
      filter_property.mutable_value().set_stringvalue(keyword)

    return query_pb
|
257 |
|
258 |
|
class SearchableModel(db.Model):
  """A db.Model subclass with full text indexing and search.

  Entities are stored as SearchableEntity instances, which index their
  string-based properties. To search, call all() to obtain a
  SearchableModel.Query and then call its search() method.
  """

  class Query(db.Query):
    """A db.Query subclass that can carry a full text search query."""

    # Stays None until search() is called.
    _search_query = None

    def search(self, search_query):
      """Records a full text search to apply when the query is run.

      Args:
        search_query, a string containing the full text search query.

      Returns:
        self
      """
      self._search_query = search_query
      return self

    def _get_query(self):
      """Builds the underlying query as a SearchableQuery.

      Delegates to db.Query._get_query() with SearchableQuery as the query
      class, and forwards the search query to it if one has been set.
      """
      low_level_query = db.Query._get_query(self, _query_class=SearchableQuery)
      if not self._search_query:
        return low_level_query
      low_level_query.Search(self._search_query)
      return low_level_query

  def _populate_internal_entity(self):
    """Delegates to db.Model._populate_internal_entity() with
    SearchableEntity as the entity class."""
    return db.Model._populate_internal_entity(
        self, _entity_class=SearchableEntity)

  @classmethod
  def from_entity(cls, entity):
    """Converts entity to a SearchableEntity if needed, then delegates to
    db.Model.from_entity()."""
    searchable = entity
    if not isinstance(searchable, SearchableEntity):
      searchable = SearchableEntity(searchable)
    return super(SearchableModel, cls).from_entity(searchable)

  @classmethod
  def all(cls):
    """Returns a SearchableModel.Query for this kind."""
    return SearchableModel.Query(cls)