1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18  r"""datastructures.py: Datastructures for search engine core. 
 19   
 20  """ 
 21  __docformat__ = "restructuredtext en" 
 22   
 23  import errors 
 24  from replaylog import log 
 25  import xapian 
 26  import cPickle 
 27   
 29       
 30       
 31      __slots__ = 'name', 'value' 
 32   
 36   
 38          return 'Field(%r, %r)' % (self.name, self.value) 
   39   
 41      """A unprocessed document to be passed to the indexer. 
 42   
 43      This represents an item to be processed and stored in the search engine. 
 44      Each document will be processed by the indexer to generate a 
 45      ProcessedDocument, which can then be stored in the search engine index. 
 46   
 47      Note that some information in an UnprocessedDocument will not be 
 48      represented in the ProcessedDocument: therefore, it is not possible to 
 49      retrieve an UnprocessedDocument from the search engine index. 
 50   
 51      An unprocessed document is a simple container with two attributes: 
 52   
 53       - `fields` is a list of Field objects, or an iterator returning Field 
 54         objects. 
 55       - `id` is a string holding a unique identifier for the document (or 
 56         None to get the database to allocate a unique identifier automatically 
 57         when the document is added). 
 58   
 59      """ 
 60   
 61      __slots__ = 'id', 'fields', 
 62 -    def __init__(self, id=None, fields=None): 
  68   
 70          return 'UnprocessedDocument(%r, %r)' % (self.id, self.fields) 
   71   
 73      """A processed document, as stored in the index. 
 74   
 75      This represents an item which is ready to be stored in the search engine, 
 76      or which has been returned by the search engine. 
 77   
 78      """ 
 79   
 80      __slots__ = '_doc', '_fieldmappings', '_data', 
 81 -    def __init__(self, fieldmappings, xapdoc=None): 
  82          """Create a ProcessedDocument. 
 83   
 84          `fieldmappings` is the configuration from a database connection used lookup 
 85          the configuration to use to store each field. 
 86       
 87          If supplied, `xapdoc` is a Xapian document to store in the processed 
 88          document.  Otherwise, a new Xapian document is created. 
 89   
 90          """ 
 91          if xapdoc is None: 
 92              self._doc = log(xapian.Document) 
 93          else: 
 94              self._doc = xapdoc 
 95          self._fieldmappings = fieldmappings 
 96          self._data = None 
  97   
 98 -    def add_term(self, field, term, wdfinc=1, positions=None): 
  99          """Add a term to the document. 
100   
101          Terms are the main unit of information used for performing searches. 
102   
103          - `field` is the field to add the term to. 
104          - `term` is the term to add. 
105          - `wdfinc` is the value to increase the within-document-frequency 
106            measure for the term by. 
107          - `positions` is the positional information to add for the term. 
108            This may be None to indicate that there is no positional information, 
109            or may be an integer to specify one position, or may be a sequence of 
110            integers to specify several positions.  (Note that the wdf is not 
111            increased automatically for each position: if you add a term at 7 
112            positions, and the wdfinc value is 2, the total wdf for the term will 
113            only be increased by 2, not by 14.) 
114   
115          """ 
116          prefix = self._fieldmappings.get_prefix(field) 
117          if len(term) > 0: 
118               
119               
120               
121              if ord(term[0]) >= ord('A') and ord(term[0]) <= ord('Z'): 
122                  prefix = prefix + ':' 
123   
124           
125           
126           
127           
128           
129           
130           
131           
132           
133           
134           
135           
136          if len(prefix + term) > 220: 
137              raise errors.IndexerError("Field %r is too long: maximum length " 
138                                         "220 - was %d (%r)" % 
139                                         (field, len(prefix + term), 
140                                          prefix + term)) 
141   
142          if positions is None: 
143              self._doc.add_term(prefix + term, wdfinc) 
144          elif isinstance(positions, int): 
145              self._doc.add_posting(prefix + term, positions, wdfinc) 
146          else: 
147              self._doc.add_term(prefix + term, wdfinc) 
148              for pos in positions: 
149                  self._doc.add_posting(prefix + term, pos, 0) 
 150   
151 -    def add_value(self, field, value, purpose=''): 
 152          """Add a value to the document. 
153   
154          Values are additional units of information used when performing 
155          searches.  Note that values are _not_ intended to be used to store 
156          information for display in the search results - use the document data 
157          for that.  The intention is that as little information as possible is 
158          stored in values, so that they can be accessed as quickly as possible 
159          during the search operation. 
160           
161          Unlike terms, each document may have at most one value in each field 
162          (whereas there may be an arbitrary number of terms in a given field). 
163          If an attempt to add multiple values to a single field is made, only 
164          the last value added will be stored. 
165   
166          """ 
167          slot = self._fieldmappings.get_slot(field, purpose) 
168          self._doc.add_value(slot, value) 
 169   
171          """Get a value from the document. 
172   
173          """ 
174          slot = self._fieldmappings.get_slot(field, purpose) 
175          return self._doc.get_value(slot) 
 176   
178          """Prepare the document for adding to a xapian database. 
179   
180          This updates the internal xapian document with any changes which have 
181          been made, and then returns it. 
182   
183          """ 
184          if self._data is not None: 
185              self._doc.set_data(cPickle.dumps(self._data, 2)) 
186              self._data = None 
187          return self._doc 
 188   
190          if self._data is None: 
191              rawdata = self._doc.get_data() 
192              if rawdata == '': 
193                  self._data = {} 
194              else: 
195                  self._data = cPickle.loads(rawdata) 
196          return self._data 
 198          if not isinstance(data, dict): 
199              raise TypeError("Cannot set data to any type other than a dict") 
200          self._data = data 
 201      data = property(_get_data, _set_data, doc= 
202      """The data stored in this processed document. 
203   
204      This data is a dictionary of entries, where the key is a fieldname, and the 
205      value is a list of strings. 
206   
207      """) 
208   
210          tl = self._doc.termlist() 
211          try: 
212              term = tl.skip_to('Q').term 
213              if len(term) == 0 or term[0] != 'Q': 
214                  return None 
215          except StopIteration: 
216              return None 
217          return term[1:] 
 219          tl = self._doc.termlist() 
220          try: 
221              term = tl.skip_to('Q').term 
222          except StopIteration: 
223              term = '' 
224          if len(term) != 0 and term[0] == 'Q': 
225              self._doc.remove_term(term) 
226          if id is not None: 
227              self._doc.add_term('Q' + id, 0) 
 228      id = property(_get_id, _set_id, doc= 
229      """The unique ID for this document. 
230   
231      """) 
232   
234          return '<ProcessedDocument(%r)>' % (self.id) 
  235   
if __name__ == '__main__':
    # When executed as a script, run the doctests found in this module.
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])
239