1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18  r"""fieldactions.py: Definitions and implementations of field actions. 
 19   
 20  """ 
 21  __docformat__ = "restructuredtext en" 
 22   
 23  import _checkxapian 
 24  import errors 
 25  import marshall 
 26  from replaylog import log 
 27  import xapian 
 28  import parsedate 
 29   
 30 -def _act_store_content(fieldname, doc, value, context): 
  31      """Perform the STORE_CONTENT action. 
 32       
 33      """ 
 34      try: 
 35          fielddata = doc.data[fieldname] 
 36      except KeyError: 
 37          fielddata = [] 
 38          doc.data[fieldname] = fielddata 
 39      fielddata.append(value) 
  40   
 42      """Perform the INDEX_EXACT action. 
 43       
 44      """ 
 45      doc.add_term(fieldname, value, 0) 
  46   
 47 -def _act_tag(fieldname, doc, value, context): 
  48      """Perform the TAG action. 
 49       
 50      """ 
 51      doc.add_term(fieldname, value.lower(), 0) 
  52   
 53 -def _act_facet(fieldname, doc, value, context, type=None): 
  54      """Perform the FACET action. 
 55       
 56      """ 
 57      if type is None or type == 'string': 
 58          value = value.lower() 
 59          doc.add_term(fieldname, value, 0) 
 60          serialiser = log(xapian.StringListSerialiser, 
 61                            doc.get_value(fieldname, 'facet')) 
 62          serialiser.append(value) 
 63          doc.add_value(fieldname, serialiser.get(), 'facet') 
 64      else: 
 65          marshaller = SortableMarshaller() 
 66          fn = marshaller.get_marshall_function(fieldname, type) 
 67          doc.add_value(fieldname, fn(fieldname, value), 'facet') 
  68   
 69 -def _act_index_freetext(fieldname, doc, value, context, weight=1,  
 70                          language=None, stop=None, spell=False, 
 71                          nopos=False, 
 72                          allow_field_specific=True, 
 73                          search_by_default=True): 
  74      """Perform the INDEX_FREETEXT action. 
 75       
 76      """ 
 77      termgen = log(xapian.TermGenerator) 
 78      if language is not None: 
 79          termgen.set_stemmer(log(xapian.Stem, language)) 
 80           
 81      if stop is not None: 
 82          stopper = log(xapian.SimpleStopper) 
 83          for term in stop: 
 84              stopper.add (term) 
 85          termgen.set_stopper (stopper) 
 86   
 87      if spell: 
 88          termgen.set_database(context.index) 
 89          termgen.set_flags(termgen.FLAG_SPELLING) 
 90       
 91      termgen.set_document(doc._doc) 
 92   
 93      if search_by_default: 
 94          termgen.set_termpos(context.current_position) 
 95           
 96           
 97          if nopos: 
 98              termgen.index_text_without_positions(value, weight, '') 
 99          else: 
100              termgen.index_text(value, weight, '') 
101   
102      if allow_field_specific: 
103           
104           
105          prefix = doc._fieldmappings.get_prefix(fieldname) 
106          if len(prefix) != 0: 
107              termgen.set_termpos(context.current_position) 
108              if nopos: 
109                  termgen.index_text_without_positions(value, weight, prefix) 
110              else: 
111                  termgen.index_text(value, weight, prefix) 
112   
113       
114       
115      termgen.increase_termpos(10) 
116      context.current_position = termgen.get_termpos() 
 117   
119      """Implementation of marshalling for sortable values. 
120   
121      """ 
127   
        """Marshall a value for sorting in lexicographical order.
130   
131          This returns the input as the output, since strings already sort in 
132          lexicographical order. 
133   
134          """ 
135          return value 
 136   
138          """Marshall a value for sorting as a floating point value. 
139   
140          """ 
141           
142          try: 
143              value = float(value) 
144          except ValueError: 
145              raise self._err("Value supplied to field %r must be a " 
146                              "valid floating point number: was %r" % 
147                              (fieldname, value)) 
148          return marshall.float_to_string(value) 
 149   
161   
163          """Get a function used to marshall values of a given sorttype. 
164   
165          """ 
166          try: 
167              return { 
168                  None: self.marshall_string, 
169                  'string': self.marshall_string, 
170                  'float': self.marshall_float, 
171                  'date': self.marshall_date, 
172              }[sorttype] 
173          except KeyError: 
174              raise self._err("Unknown sort type %r for field %r" % 
175                              (sorttype, fieldname)) 
  176   
177   
186   
class ActionContext(object):
    """State shared between the actions performed on a document.

    Attributes:

    - `index`: the object supplied to the constructor.
    - `current_position`: the term position at which the next piece of
      text should be indexed; starts at 0.
    - `current_language`: initialised to None.

    """
    def __init__(self, index):
        self.index = index
        self.current_position = 0
        self.current_language = None
  198   
200      """An object describing the actions to be performed on a field. 
201   
202      The supported actions are: 
203       
204      - `STORE_CONTENT`: store the unprocessed content of the field in the search 
205        engine database.  All fields which need to be displayed or used when 
206        displaying the search results need to be given this action. 
207   
208      - `INDEX_EXACT`: index the exact content of the field as a single search 
209        term.  Fields whose contents need to be searchable as an "exact match" 
210        need to be given this action. 
211   
212      - `INDEX_FREETEXT`: index the content of this field as text.  The content 
      will be split into terms, allowing free text searching of the field.  The
      following optional parameters may be supplied:
215   
216        - 'weight' is a multiplier to apply to the importance of the field.  This 
217          must be an integer, and the default value is 1. 
218        - 'language' is the language to use when processing the field.  This can 
219          be expressed as an ISO 2-letter language code.  The supported languages 
220          are those supported by the xapian core in use. 
221        - 'stop' is an iterable of stopwords to filter out of the generated 
222          terms.  Note that due to Xapian design, only non-positional terms are 
223          affected, so this is of limited use. 
224        - 'spell' is a boolean flag - if true, the contents of the field will be 
225          used for spelling correction. 
226        - 'nopos' is a boolean flag - if true, positional information is not 
227          stored. 
228        - 'allow_field_specific' is a boolean flag - if False, prevents terms with the field 
229          prefix being generated.  This means that searches specific to this 
230          field will not work, and thus should only be used when only non-field 
231          specific searches are desired.  Defaults to True. 
232        - 'search_by_default' is a boolean flag - if False, the field will not be 
233          searched by non-field specific searches.  If True, or omitted, the 
234          field will be included in searches for non field-specific searches. 
235   
236      - `SORTABLE`: index the content of the field such that it can be used to 
237        sort result sets.  It also allows result sets to be restricted to those 
      documents with field values in a given range.  One optional parameter
239        may be supplied: 
240   
241        - 'type' is a value indicating how to sort the field.  It has several 
242          possible values: 
243   
244          - 'string' - sort in lexicographic (ie, alphabetical) order. 
245            This is the default, used if no type is set. 
246          - 'float' - treat the values as (decimal representations of) floating 
247            point numbers, and sort in numerical order.  The values in the field 
248            must be valid floating point numbers (according to Python's float() 
249            function). 
250          - 'date' - sort in date order.  The values must be valid dates (either 
251            Python datetime.date objects, or ISO 8601 format (ie, YYYYMMDD or 
          YYYY-MM-DD)).
253   
254      - `COLLAPSE`: index the content of the field such that it can be used to 
255        "collapse" result sets, such that only the highest result with each value 
256        of the field will be returned. 
257   
258      - `TAG`: the field contains tags; these are strings, which will be matched 
259        in a case insensitive way, but otherwise must be exact matches.  Tag 
      fields can be searched for by making an explicit query (ie, using
261        query_field(), but not with query_parse()).  A list of the most frequent 
262        tags in a result set can also be accessed easily. 
263   
264      - `FACET`: the field represents a classification facet; these are strings 
265        which will be matched exactly, but a list of all the facets present in 
266        the result set can also be accessed easily - in addition, a suitable 
267        subset of the facets, and a selection of the facet values, present in the 
268        result set can be calculated.  One optional parameter may be supplied: 
269   
270        - 'type' is a value indicating the type of facet contained in the field: 
271   
272          - 'string' - the facet values are exact binary strings. 
273          - 'float' - the facet values are floating point numbers. 
274   
275      """ 
276   
277       
    # Public action identifiers accepted by add().
    STORE_CONTENT = 1
    INDEX_EXACT = 2
    INDEX_FREETEXT = 3
    SORTABLE = 4 
    COLLAPSE = 5
    TAG = 6
    FACET = 7

    # Internal action: add() records both SORTABLE and COLLAPSE requests
    # under this single identifier.  add() rejects it as an unknown action
    # if a caller passes it directly.
    SORT_AND_COLLAPSE = -1

    # Actions for which add() raises IndexerError.
    _unsupported_actions = []

    # TAG and FACET are marked unsupported when _checkxapian lists the
    # corresponding feature as missing.
    if 'tags' in _checkxapian.missing_features:
        _unsupported_actions.append(TAG)
    if 'facets' in _checkxapian.missing_features:
        _unsupported_actions.append(FACET)
297   
299           
300          self._actions = {} 
301          self._fieldname = fieldname 
 302   
303 -    def add(self, field_mappings, action, **kwargs): 
 304          """Add an action to perform on a field. 
305   
306          """ 
307          if action in self._unsupported_actions: 
308              raise errors.IndexerError("Action unsupported with this release of xapian") 
309   
310          if action not in (FieldActions.STORE_CONTENT, 
311                            FieldActions.INDEX_EXACT, 
312                            FieldActions.INDEX_FREETEXT, 
313                            FieldActions.SORTABLE, 
314                            FieldActions.COLLAPSE, 
315                            FieldActions.TAG, 
316                            FieldActions.FACET, 
317                           ): 
318              raise errors.IndexerError("Unknown field action: %r" % action) 
319   
320          info = self._action_info[action] 
321   
322           
323          for key in kwargs.keys(): 
324              if key not in info[1]: 
325                  raise errors.IndexerError("Unknown parameter name for action %r: %r" % (info[0], key)) 
326   
327           
328           
329           
330          if action == FieldActions.INDEX_EXACT: 
331              if FieldActions.INDEX_FREETEXT in self._actions: 
332                  raise errors.IndexerError("Field %r is already marked for indexing " 
333                                     "as free text: cannot mark for indexing " 
334                                     "as exact text as well" % self._fieldname) 
335          if action == FieldActions.INDEX_FREETEXT: 
336              if FieldActions.INDEX_EXACT in self._actions: 
337                  raise errors.IndexerError("Field %r is already marked for indexing " 
338                                     "as exact text: cannot mark for indexing " 
339                                     "as free text as well" % self._fieldname) 
340   
341           
342           
343           
344           
345           
346          if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE: 
347              if action == FieldActions.COLLAPSE: 
348                  sorttype = None 
349              else: 
350                  try: 
351                      sorttype = kwargs['type'] 
352                  except KeyError: 
353                      sorttype = 'string' 
354              kwargs['type'] = sorttype 
355              action = FieldActions.SORT_AND_COLLAPSE 
356   
357              try: 
358                  oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE] 
359              except KeyError: 
360                  oldsortactions = () 
361   
362              if len(oldsortactions) > 0: 
363                  for oldsortaction in oldsortactions: 
364                      oldsorttype = oldsortaction['type'] 
365   
366                  if sorttype == oldsorttype or oldsorttype is None: 
367                       
368                      self._actions[action] = [] 
369                  elif sorttype is None: 
370                       
371                      return 
372                  else: 
373                      raise errors.IndexerError("Field %r is already marked for " 
374                                                 "sorting, with a different " 
375                                                 "sort type" % self._fieldname) 
376   
377          if 'prefix' in info[3]: 
378              field_mappings.add_prefix(self._fieldname) 
379          if 'slot' in info[3]: 
380              purposes = info[3]['slot'] 
381              if isinstance(purposes, basestring): 
382                  field_mappings.add_slot(self._fieldname, purposes) 
383              else: 
384                  slotnum = None 
385                  for purpose in purposes: 
386                      slotnum = field_mappings.get_slot(self._fieldname, purpose) 
387                      if slotnum is not None: 
388                          break 
389                  for purpose in purposes: 
390                      field_mappings.add_slot(self._fieldname, purpose, slotnum=slotnum) 
391   
392           
393          if action not in self._actions: 
394              self._actions[action] = [] 
395   
396           
397          for old_action in self._actions[action]: 
398              if old_action == kwargs: 
399                  return 
400   
401           
402          self._actions[action].append(kwargs) 
 403   
416   
    # Table describing each action.  Each value is a 4-tuple:
    #
    #  [0] the action's name, used by add() in error messages;
    #  [1] the parameter names which add() accepts for the action;
    #  [2] the function implementing the action (None for SORTABLE and
    #      COLLAPSE, which add() records as SORT_AND_COLLAPSE instead)
    #      - presumably called when the actions are performed; the caller
    #      is not shown in this part of the file, so confirm there;
    #  [3] the requirements of the action: a 'prefix' key makes add()
    #      register a prefix for the field, and a 'slot' key gives the
    #      purpose (or purposes) for which add() registers a slot.
    _action_info = {
        STORE_CONTENT: ('STORE_CONTENT', (), _act_store_content, {}, ),
        INDEX_EXACT: ('INDEX_EXACT', (), _act_index_exact, {'prefix': True}, ),
        INDEX_FREETEXT: ('INDEX_FREETEXT', ('weight', 'language', 'stop', 'spell', 'nopos', 'allow_field_specific', 'search_by_default', ), 
            _act_index_freetext, {'prefix': True, }, ),
        SORTABLE: ('SORTABLE', ('type', ), None, {'slot': 'collsort',}, ),
        COLLAPSE: ('COLLAPSE', (), None, {'slot': 'collsort',}, ),
        TAG: ('TAG', (), _act_tag, {'prefix': True,}, ),
        FACET: ('FACET', ('type', ), _act_facet, {'prefix': True, 'slot': 'facet',}, ),

        SORT_AND_COLLAPSE: ('SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse, {'slot': 'collsort',}, ),
    }
 429   
if __name__ == '__main__':
    # When run as a script, execute the doctests found in this module.
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])
433