1.1 --- a/iixr.py Thu Aug 27 00:02:50 2009 +0200
1.2 +++ b/iixr.py Thu Aug 27 20:52:48 2009 +0200
1.3 @@ -18,9 +18,15 @@
1.4 with this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from os import mkdir # to create the index directory when it is missing
1.8 +from os.path import exists, join
1.9 from os.path import commonprefix # to find common string prefixes
1.10 from bisect import bisect_right # to find terms in the dictionary index
1.11
1.12 +# Constants.
1.13 +
1.14 +INTERVAL = 100
1.15 +
1.16 # Foundation classes.
1.17
1.18 class File:
1.19 @@ -35,7 +41,9 @@
1.20 pass
1.21
1.22 def close(self):
1.23 - self.f.close()
1.24 + if self.f is not None:
1.25 + self.f.close()
1.26 + self.f = None
1.27
1.28 class FileWriter(File):
1.29
1.30 @@ -74,6 +82,11 @@
1.31
1.32 "Write 's' to the file, recording its length."
1.33
1.34 + # Encode Unicode objects as UTF-8 byte strings.
1.35 +
1.36 + if isinstance(s, unicode):
1.37 + s = s.encode("utf-8")
1.38 +
1.39 length = len(s)
1.40
1.41 if not (0 <= length <= 255):
1.42 @@ -115,7 +128,10 @@
1.43 "Read a string from the file."
1.44
1.45 length = self.read_number()
1.46 - return self.f.read(length)
1.47 +
1.48 + # Decode the UTF-8 bytes read from the file into a Unicode object.
1.49 +
1.50 + return unicode(self.f.read(length), "utf-8")
1.51
1.52 # Specific classes.
1.53
1.54 @@ -141,6 +157,10 @@
1.55
1.56 self.write_number(len(positions))
1.57
1.58 + # Make sure that the positions are sorted.
1.59 +
1.60 + positions.sort()
1.61 +
1.62 # Write the position deltas.
1.63
1.64 last = 0
1.65 @@ -492,6 +512,8 @@
1.66 doc.append(position)
1.67
1.68 def close(self):
1.69 + if self.dict_writer is None:
1.70 + return
1.71
1.72 # Get the terms in order.
1.73
1.74 @@ -504,5 +526,63 @@
1.75 self.dict_writer.write_term_positions(term, doc_positions)
1.76
1.77 self.dict_writer.close()
1.78 + self.dict_writer = None
1.79 +
1.80 +class Index:
1.81 +
1.82 + "An inverted index solution encapsulating the various components."
1.83 +
1.84 + def __init__(self, pathname):
1.85 + self.pathname = pathname
1.86 + self.reader = None
1.87 + self.writer = None
1.88 +
1.89 + def get_writer(self, interval=INTERVAL):
1.90 +
1.91 + "Return a writer, optionally using the given indexing 'interval'."
1.92 +
1.93 + if not exists(self.pathname):
1.94 + mkdir(self.pathname)
1.95 +
1.96 + tdf = open(join(self.pathname, "terms"), "wb")
1.97 + info_writer = TermWriter(tdf)
1.98 +
1.99 + tdif = open(join(self.pathname, "index"), "wb")
1.100 + index_writer = TermIndexWriter(tdif)
1.101 +
1.102 + tpf = open(join(self.pathname, "positions"), "wb")
1.103 + positions_writer = PositionWriter(tpf)
1.104 +
1.105 + dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
1.106 +
1.107 + self.writer = IndexWriter(dict_writer)
1.108 + return self.writer
1.109 +
1.110 + def get_reader(self):
1.111 +
1.112 + "Return a reader for the index."
1.113 +
1.114 + if not exists(self.pathname):
1.115 + raise OSError, "Index path %r does not exist." % self.pathname
1.116 +
1.117 + tdf = open(join(self.pathname, "terms"), "rb")
1.118 + info_reader = TermReader(tdf)
1.119 +
1.120 + tdif = open(join(self.pathname, "index"), "rb")
1.121 + index_reader = TermIndexReader(tdif)
1.122 +
1.123 + tpf = open(join(self.pathname, "positions"), "rb")
1.124 + positions_reader = PositionReader(tpf)
1.125 +
1.126 + self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader)
1.127 + return self.reader
1.128 +
1.129 + def close(self):
1.130 + if self.reader is not None:
1.131 + self.reader.close()
1.132 + self.reader = None
1.133 + if self.writer is not None:
1.134 + self.writer.close()
1.135 + self.writer = None
1.136
1.137 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/test.py Thu Aug 27 00:02:50 2009 +0200
2.2 +++ b/test.py Thu Aug 27 20:52:48 2009 +0200
2.3 @@ -192,29 +192,17 @@
2.4 ("sea", [(36, [2, 6])])
2.5 ]
2.6
2.7 -f = open("test", "wb")
2.8 -w = iixr.TermWriter(f)
2.9 -f2 = open("testI", "wb")
2.10 -w2 = iixr.TermIndexWriter(f2)
2.11 -f3 = open("testP", "wb")
2.12 -w3 = iixr.PositionWriter(f3)
2.13 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
2.14 -wi = iixr.IndexWriter(wd)
2.15 +index = iixr.Index("test_index")
2.16 +wi = index.get_writer(3)
2.17 for docnum, text in docs:
2.18 for position, term in enumerate(text.split()):
2.19 wi.add_position(term, docnum, position)
2.20 wi.close()
2.21
2.22 -f = open("test", "rb")
2.23 -r = iixr.TermReader(f)
2.24 -f2 = open("testI", "rb")
2.25 -r2 = iixr.TermIndexReader(f2)
2.26 -f3 = open("testP", "rb")
2.27 -r3 = iixr.PositionReader(f3)
2.28 -rd = iixr.TermDictionaryReader(r, r2, r3)
2.29 +rd = index.get_reader()
2.30 for term, doc_positions in doc_tests:
2.31 dp = rd.find_positions(term)
2.32 print doc_positions == dp, doc_positions, dp
2.33 -rd.close()
2.34 +index.close()
2.35
2.36 # vim: tabstop=4 expandtab shiftwidth=4