pprocess

tests/Dict.py

131:26a480cc2c4b
2007-11-24 paulb [project @ 2007-11-24 00:08:29 by paulb] Updated release notes.
     1 #!/usr/bin/env python     2      3 "A simple file indexer."     4      5 import codecs     6 import time     7      8 class Parser:     9     def __init__(self, filenames, encoding=None, delay=None):    10         self.filenames = filenames    11         self.encoding = encoding    12         self.delay = delay    13     14     def _get_file_content(self, filename):    15         if self.encoding is None:    16             f = open(filename)    17         else:    18             f = codecs.open(filename, encoding=self.encoding)    19         s = f.read()    20         f.close()    21         return s    22     23     def send_entries(self, channel):    24     25         "Send word entries from the file."    26     27         for filename in self.filenames:    28             tokens = self._get_file_content(filename).split()    29             index = {}    30     31             words = []    32             for token in tokens:    33                 token = self._strip(token)    34                 if token not in words:    35                     channel.send((token, filename))    36                     words.append(token)    37     38             # Introduce a delay to simulate hard work.    39     40             if self.delay:    41                 time.sleep(self.delay)    42     43     def _strip(self, token):    44     45         "Return the token stripped of non-alphanumeric symbols at each end."    46     47         characters = []    48         in_alphanum = 0    49         for c in token:    50             if not c.isalpha() and not c.isdigit():    51                 if in_alphanum:    52                     break    53             else:    54                 in_alphanum = 1    55                 characters.append(c)    56         return "".join(characters)    57     58 class Indexer:    59     def __init__(self):    60         self.index = {}    61     62     def get_index(self):    63         return self.index    64     65     def add_entry(self, entry):    66     67         "Add the given word 'entry' (token, filename) to the index."    68     69         token, filename = entry    70     71         if not token:    72             return    73     74         slot = self.index    75         for c in token:    76             if not slot.has_key(c):    77                 slot[c] = {}, {}    78             slot, words = slot[c]    79     80         if not words.has_key(token):    81             words[token] = []    82         words[token].append(filename)    83     84 class Searcher:    85     def __init__(self, index):    86         self.index = index    87     88     def find(self, pattern):    89     90         "Find words beginning with the given 'pattern'."    91     92         slot = self.index    93         words = []    94     95         for c in pattern:    96             if not slot.has_key(c):    97                 return []    98             slot, words = slot[c]    99    100         results = {}   101         results.update(words)   102         results.update(self.get_all_words(slot))   103         return results   104    105     def get_all_words(self, slot):   106    107         "Get all words under the given index 'slot'."   108    109         all_words = {}   110         keys = slot.keys()   111         keys.sort()   112         for c in keys:   113             this_slot, words = slot[c]   114             all_words.update(words)   115             all_words.update(self.get_all_words(this_slot))   116         return all_words   117    118 # vim: tabstop=4 expandtab shiftwidth=4