r"""highlight.py: Highlight and summarise text.

"""
__docformat__ = "restructuredtext en"

import re
import xapian


class Highlighter(object):
    """Class for highlighting text and creating contextual summaries.

    >>> hl = Highlighter("en")
    >>> hl.makeSample('Hello world.', ['world'])
    'Hello world.'
    >>> hl.highlight('Hello world', ['world'], ('<', '>'))
    'Hello <world>'

    """

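    # Regex used by _split_text: matches markup tags, words (including
    # apostrophes), runs of whitespace, and runs of other punctuation.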
    _split_re = re.compile(r'<\w+[^>]*>|</\w+>|[\w\']+|\s+|[^\w\'\s<>/]+')

    def __init__(self, language_code='en', stemmer=None):
        """Create a new highlighter for the specified language.

        """
        if stemmer is not None:
            self.stem = stemmer
        else:
            self.stem = xapian.Stem(language_code)

    def _split_text(self, text, strip_tags=False):
        """Split some text into words and non-words.

        - `text` is the text to process.  It may be a unicode object or a utf-8
          encoded simple string.
        - `strip_tags` is a flag - False to keep tags, True to strip all tags
          from the output.

        Returns a list of utf-8 encoded simple strings.

        """
        if isinstance(text, unicode):
            text = text.encode('utf-8')

        words = self._split_re.findall(text)
        if strip_tags:
            return [w for w in words if w[0] != '<']
        else:
            return words

    def _strip_prefix(self, term):
        """Strip the prefix off a term.

        Prefixes are any initial capital letters, with the exception that R always
        ends a prefix, even if followed by capital letters.

        >>> hl = Highlighter("en")
        >>> print hl._strip_prefix('hello')
        hello
        >>> print hl._strip_prefix('Rhello')
        hello
        >>> print hl._strip_prefix('XARHello')
        Hello
        >>> print hl._strip_prefix('XAhello')
        hello
        >>> print hl._strip_prefix('XAh')
        h
        >>> print hl._strip_prefix('XA')
        <BLANKLINE>

        """
        for p in xrange(len(term)):
            if term[p].islower():
                return term[p:]
            elif term[p] == 'R':
                return term[p+1:]
        return ''

    def _query_to_stemmed_words(self, query):
        """Convert a query to a list of stemmed words.

        - `query` is the query to parse: it may be a xapian.Query object, or a
          sequence of terms.

        """
        if isinstance(query, xapian.Query):
            return [self._strip_prefix(t) for t in query]
        else:
            return [self.stem(q.lower()) for q in query]

    def makeSample(self, text, query, maxlen=600, hl=None):
        """Make a contextual summary from the supplied text.

        This basically works by splitting the text into phrases, counting the query
        terms in each, and keeping those with the most.

        Any markup tags in the text will be stripped.

        `text` is the source text to summarise.
        `query` is either a Xapian query object or a list of (unstemmed) term strings.
        `maxlen` is the maximum length of the generated summary.
        `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>')

        """

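        # Coerce maxlen to an int: in Python 2 a string maxlen would compare
        # greater than any character count, so truncation would never happen.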
        maxlen = int(maxlen)

        words = self._split_text(text, True)
        terms = self._query_to_stemmed_words(query)

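        # Build blocks of words delimited by punctuation.  Each block is a list
        # [start, end, blockchars, count, selected]: start/end indices into
        # `words`, the block's character count, the number of query term
        # matches, and a flag marking whether the block is kept in the summary.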
        blocks = []
        start = end = count = blockchars = 0

        while end < len(words):
            blockchars += len(words[end])
            if words[end].isalnum():
                if self.stem(words[end].lower()) in terms:
                    count += 1
                end += 1
            elif words[end] in ',.;:?!\n':
                end += 1
                blocks.append([start, end, blockchars, count, False])
                start = end
                blockchars = 0
                count = 0
            else:
                end += 1
        if start != end:
            blocks.append([start, end, blockchars, count, False])
        if len(blocks) == 0:
            return ''

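        # Choose which blocks to keep: prefer blocks with the most query term
        # matches, lowering the threshold from 3 down to 0 until the running
        # character total reaches maxlen.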
        chars = 0
        for count in xrange(3, -1, -1):
            for b in blocks:
                if b[3] >= count:
                    b[4] = True
                    chars += b[2]
                    if chars >= maxlen: break
            if chars >= maxlen: break

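        # Concatenate the selected blocks, inserting '..' wherever unselected
        # blocks have been skipped over.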
        words2 = []
        lastblock = -1
        for i, b in enumerate(blocks):
            if b[4]:
                if i != lastblock + 1:
                    words2.append('..')
                words2.extend(words[b[0]:b[1]])
                lastblock = i

        if not blocks[-1][4]:
            words2.append('..')

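        # Trim the assembled summary down to maxlen characters, replacing the
        # cut-off tail with '..'.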
        l = 0
        for i in xrange(len(words2)):
            l += len(words2[i])
            if l >= maxlen:
                words2[i:] = ['..']
                break

        if hl is None:
            return ''.join(words2)
        else:
            return self._hl(words2, terms, hl)

    def highlight(self, text, query, hl, strip_tags=False):
        """Add highlights (string prefix/postfix) to a string.

        `text` is the source to highlight.
        `query` is either a Xapian query object or a list of (unstemmed) term strings.
        `hl` is a pair of highlight strings, e.g. ('<i>', '</i>')
        `strip_tags` strips HTML markup iff True

        >>> hl = Highlighter()
        >>> qp = xapian.QueryParser()
        >>> q = qp.parse_query('cat dog')
        >>> tags = ('[[', ']]')
        >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags)
        'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.'

        """
        words = self._split_text(text, strip_tags)
        terms = self._query_to_stemmed_words(query)
        return self._hl(words, terms, hl)

    def _hl(self, words, terms, hl):
        """Add highlights to a list of words.

        `words` is the list of words and non-words to be highlighted.
        `terms` is the list of stemmed words to look for.

        """
        for i, w in enumerate(words):
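            # Match either the lowercased word or its stemmed form, so that both
            # stemmed and unstemmed term lists highlight correctly.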
            wl = w.lower()
            if wl in terms or self.stem(wl) in terms:
                words[i] = ''.join((hl[0], w, hl[1]))

        return ''.join(words)


__test__ = {
    'no_punc': r'''

    Test the highlighter's behaviour when there is no punctuation in the sample
    text (regression test - used to return no output):
    >>> hl = Highlighter("en")
    >>> hl.makeSample('Hello world', ['world'])
    'Hello world'

    ''',

    'stem_levels': r'''

    Test highlighting of words, and how it works with stemming:
    >>> hl = Highlighter("en")

    # "word" and "wording" stem to "word", so the following 4 calls all return
    # the same thing
    >>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>')
    'Hello. <word>. <wording>. wordinging.'
    >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
    'Hello. <word>. <wording>. wordinging.'
    >>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>')
    'Hello. <word>. <wording>. wordinging.'
    >>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>')
    'Hello. <word>. <wording>. wordinging.'

    # "wordinging" stems to "wording", so only the last two words are
    # highlighted for this one.
    >>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>')
    'Hello. word. <wording>. <wordinging>.'
    >>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>')
    'Hello. word. <wording>. <wordinging>.'
    ''',

    'supplied_stemmer': r'''

    Test behaviour if we pass in our own stemmer:
    >>> stem = xapian.Stem('en')
    >>> hl = Highlighter(stemmer=stem)
    >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
    'Hello. <word>. <wording>. wordinging.'

    ''',

    'unicode': r'''

    Test behaviour if we pass in unicode input:
    >>> hl = Highlighter('en')
    >>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>')
    'Hello\xc3\xb3. <word>. <wording>. wordinging.'

    ''',

    'no_sample': r'''

    Test behaviour if we pass in an empty string:
    >>> hl = Highlighter('en')
    >>> hl.makeSample(u'', ['word'])
    ''

    ''',

    'short_samples': r'''

    >>> hl = Highlighter('en')
    >>> hl.makeSample("A boring start.  Hello world indeed.  A boring end.", ['hello'], 20, ('<', '>'))
    '..  <Hello> world ..'
    >>> hl.makeSample("A boring start.  Hello world indeed.  A boring end.", ['hello'], 40, ('<', '>'))
    'A boring start.  <Hello> world indeed...'
    >>> hl.makeSample("A boring start.  Hello world indeed.  A boring end.", ['boring'], 40, ('<', '>'))
    'A <boring> start...  A <boring> end.'

    ''',

    'apostrophes': r'''

    >>> hl = Highlighter('en')
    >>> hl.makeSample("A boring start.  Hello world's indeed.  A boring end.", ['world'], 40, ('<', '>'))
    "A boring start.  Hello <world's> indeed..."

    ''',

}

if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])