#!/usr/bin/env python

"""A simple file indexer.

Provides three cooperating pieces:

  * Parser   - reads files and emits (token, filename) entries over a channel;
  * Indexer  - stores entries in a character trie;
  * Searcher - finds all indexed words beginning with a given prefix.
"""

import codecs
import time


class Parser:

    """Read a collection of files and send their distinct words to a channel."""

    def __init__(self, filenames, encoding=None, delay=None):
        """Remember the files to parse and how to parse them.

        'filenames' is an iterable of paths to read.
        'encoding', if not None, is the codec name used to decode each file;
        otherwise files are opened with the platform default.
        'delay', if true, is a number of seconds to sleep after each file.
        """

        self.filenames = filenames
        self.encoding = encoding
        self.delay = delay

    def _get_file_content(self, filename):
        """Return the entire content of 'filename' as a string."""

        if self.encoding is None:
            f = open(filename)
        else:
            f = codecs.open(filename, encoding=self.encoding)

        # The context manager guarantees the handle is closed even if the
        # read raises (for example on a decoding error).

        with f:
            return f.read()

    def send_entries(self, channel):
        """Send word entries from the files.

        For each file, every distinct stripped token is sent exactly once, in
        order of first appearance, as a (token, filename) tuple via
        'channel.send'. A token which strips to the empty string is sent like
        any other; Indexer.add_entry ignores such entries.
        """

        for filename in self.filenames:
            tokens = self._get_file_content(filename).split()

            # A set gives constant-time duplicate detection; entries are sent
            # in the order the tokens are first seen.

            seen = set()
            for token in tokens:
                token = self._strip(token)
                if token not in seen:
                    channel.send((token, filename))
                    seen.add(token)

            # Introduce a delay to simulate hard work.

            if self.delay:
                time.sleep(self.delay)

    def _strip(self, token):
        """Return the first run of alphanumeric characters in 'token'.

        Leading non-alphanumeric characters are skipped and scanning stops at
        the first non-alphanumeric character after the run has begun, so
        "--hello," gives "hello" and "don't" gives "don". A token containing
        no alphanumeric characters gives the empty string.
        """

        characters = []
        for c in token:
            if c.isalpha() or c.isdigit():
                characters.append(c)
            elif characters:
                # The alphanumeric run has ended.
                break
        return "".join(characters)


class Indexer:

    """Build a character trie mapping words to the files containing them.

    Each trie node is a dictionary mapping a character to a pair
    (children, words), where 'children' is the node for longer words and
    'words' maps each complete word ending at that character to a list of
    filenames.
    """

    def __init__(self):
        """Create an empty index."""

        self.index = {}

    def get_index(self):
        """Return the root node of the index."""

        return self.index

    def add_entry(self, entry):
        """Add the given word 'entry' (token, filename) to the index.

        Entries with an empty token are ignored. The filename is appended to
        the token's list each time, so repeated entries are recorded
        repeatedly.
        """

        token, filename = entry

        if not token:
            return

        # Walk (and extend) the trie one character at a time; after the loop
        # 'words' is the word dictionary held at the token's final character.

        slot = self.index
        for c in token:
            if c not in slot:
                slot[c] = {}, {}
            slot, words = slot[c]

        words.setdefault(token, []).append(filename)


class Searcher:

    """Look up words in an index produced by Indexer."""

    def __init__(self, index):
        """Search the given 'index' (the root node from Indexer.get_index)."""

        self.index = index

    def find(self, pattern):
        """Find words beginning with the given 'pattern'.

        Return a dictionary mapping each matching word to its list of
        filenames. If no indexed word begins with 'pattern', an empty list is
        returned instead. An empty 'pattern' matches every indexed word.
        """

        slot = self.index
        words = []

        for c in pattern:
            if c not in slot:
                return []
            slot, words = slot[c]

        # 'words' holds the exact match (if any); everything beneath 'slot'
        # is a longer word sharing the prefix.

        results = {}
        results.update(words)
        results.update(self.get_all_words(slot))
        return results

    def get_all_words(self, slot):
        """Get all words under the given index 'slot'.

        Return a dictionary mapping each word to its list of filenames,
        visiting child characters in sorted order.
        """

        all_words = {}
        for c in sorted(slot):
            this_slot, words = slot[c]
            all_words.update(words)
            all_words.update(self.get_all_words(this_slot))
        return all_words

# vim: tabstop=4 expandtab shiftwidth=4