1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr/__init__.py Tue Sep 15 00:15:11 2009 +0200
1.3 @@ -0,0 +1,23 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +A simple (and sane) text indexing library.
1.8 +
1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +"""
1.23 +
1.24 +from iixr.index import *
1.25 +
1.26 +# vim: tabstop=4 expandtab shiftwidth=4
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/iixr/data.py Tue Sep 15 00:15:11 2009 +0200
2.3 @@ -0,0 +1,64 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +"""
2.7 +Variable-length integer functions.
2.8 +
2.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
2.10 +
2.11 +This program is free software; you can redistribute it and/or modify it under
2.12 +the terms of the GNU General Public License as published by the Free Software
2.13 +Foundation; either version 3 of the License, or (at your option) any later
2.14 +version.
2.15 +
2.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.19 +
2.20 +You should have received a copy of the GNU General Public License along
2.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
2.22 +"""
2.23 +
try:

    # Prefer the optional extension module when it is installed. Only the
    # import itself is guarded, since it is the only statement here that can
    # raise ImportError.

    from vint import vint as _vint

except ImportError:

    def _vint(number):

        """
        Pure Python fallback: encode the non-negative integer 'number' as a
        string of 7-bit digits, least significant digit first. Every character
        except the last has its high bit (128) set to signal continuation.
        """

        # Special case: one byte containing a 7-bit number.

        if number < 128:
            return chr(number)

        # Write the number from least to most significant digits.

        digits = []

        while number != 0:
            lsd = number & 127
            number = number >> 7
            if number != 0:
                lsd |= 128
            digits.append(chr(lsd))

        return "".join(digits)

def vint(number):

    """
    Write 'number' as a variable-length integer.

    Raise ValueError if 'number' is negative: negative numbers are not
    supported by the encoding.
    """

    if number < 0:
        raise ValueError("Number %r is negative." % number)

    return _vint(number)
2.66 +
2.67 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/iixr/fields.py Tue Sep 15 00:15:11 2009 +0200
3.3 @@ -0,0 +1,256 @@
3.4 +#!/usr/bin/env python
3.5 +
3.6 +"""
3.7 +Specific classes for storing document information.
3.8 +
3.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
3.10 +
3.11 +This program is free software; you can redistribute it and/or modify it under
3.12 +the terms of the GNU General Public License as published by the Free Software
3.13 +Foundation; either version 3 of the License, or (at your option) any later
3.14 +version.
3.15 +
3.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
3.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
3.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
3.19 +
3.20 +You should have received a copy of the GNU General Public License along
3.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
3.22 +"""
3.23 +
3.24 +from iixr.files import *
3.25 +from bisect import bisect_right # to find terms in the dictionary index
3.26 +
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Store the 'fields' of document 'docnum' and return the offset at which
        the record begins. 'fields' is a list of (identifier, value) pairs: an
        integer identifier followed by a string value.

        The record consists of the document number, written as a delta from
        the previously written document number, the number of fields, and then
        each identifier followed by its compressed value.
        """

        start = self.tell()

        # Record header: document number delta, then the field count.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Record body: one identifier and one compressed value per field.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return start
3.60 +
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Start document number deltas from zero again."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next record from the file and return a tuple of the document
        number and a list of field (identifier, value) pairs.
        """

        # The stored document number is a delta from the previous one.

        self.last_docnum += self.read_number()

        # The field count says how many identifier/value pairs follow.

        nfields = self.read_number()
        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read the record stored at 'offset', which the caller states belongs to
        document 'docnum', and return (docnum, fields). Afterwards the reader
        is positioned to continue scanning for later documents.
        """

        self.seek(offset)

        # The number decoded by read_fields is a delta added to whatever
        # document number was current before the seek, so it is discarded in
        # favour of the number supplied by the caller.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
3.108 +
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Start document number and offset deltas from zero again."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Record that the fields of document 'docnum' start at 'offset' in the
        fields file. Both values are written as deltas from the previously
        written entry.
        """

        for current, previous in ((docnum, self.last_docnum), (offset, self.last_offset)):
            self.write_number(current - previous)

        self.last_docnum, self.last_offset = docnum, offset
3.131 +
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Start document number and offset deltas from zero again."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read the next index entry and return a (document number, fields file
        offset) tuple. Both values are stored as deltas and are accumulated
        here.
        """

        docnum = self.last_docnum + self.read_number()
        self.last_docnum = docnum

        offset = self.last_offset + self.read_number()
        self.last_offset = offset

        return docnum, offset
3.150 +
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' for the records, a
        'field_index_writer' for the index, and the 'interval' at which records
        are also recorded in the index.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only every 'interval'-th record, starting with the first, gets an
        # index entry.

        if self.entry % self.interval == 0:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        """
        Close both underlying writers. The index writer is closed even if
        closing the field writer raises an exception, which then propagates.
        """

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()
3.175 +
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' for the records and a
        'field_index_reader' whose entries are all read immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Load every (docnum, offset) index entry until the index is exhausted.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next (docnum, fields) tuple, ending iteration at end of file."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Also provide the method name used by the newer iterator protocol.

    __next__ = next

    # Sequential access methods.

    def rewind(self):
        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        None if no such document is stored.
        """

        # Pairing 'docnum' with the largest known offset makes the search land
        # after any index entry having this document number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the index entry for the document or for one preceding it. If
        # there is none, the document precedes everything in the index.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Running off the end of the file
        # just means that the document is absent.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        """
        Close both underlying readers. The index reader is closed even if
        closing the field reader raises an exception, which then propagates.
        """

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()
3.258 +
3.259 +# vim: tabstop=4 expandtab shiftwidth=4
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/iixr/files.py Tue Sep 15 00:15:11 2009 +0200
4.3 @@ -0,0 +1,264 @@
4.4 +#!/usr/bin/env python
4.5 +
4.6 +"""
4.7 +Generic file access.
4.8 +
4.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
4.10 +
4.11 +This program is free software; you can redistribute it and/or modify it under
4.12 +the terms of the GNU General Public License as published by the Free Software
4.13 +Foundation; either version 3 of the License, or (at your option) any later
4.14 +version.
4.15 +
4.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
4.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
4.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
4.19 +
4.20 +You should have received a copy of the GNU General Public License along
4.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
4.22 +"""
4.23 +
4.24 +from iixr.data import vint
4.25 +import bz2, zlib
4.26 +
4.27 +# Constants.
4.28 +
# Amount of pending output, measured with len(), at which a writer flushes.
WRITE_CACHE_SIZE = 100000

# Minimum amount requested from the file whenever a reader refills its cache.
READ_CACHE_SIZE = 10000

# Consumed cache prefix beyond which a reader trims its cache.
READ_CACHE_RESIZE = 5000

# Compressors in the order they are tried, each with the flag character stored
# in front of data it produced.
compressors = [
    ("b", bz2.compress),
    ("z", zlib.compress),
    ]

# The matching decompressor for each stored flag character.
decompressors = {
    "b" : bz2.decompress,
    "z" : zlib.decompress,
    }
4.35 +
class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Wrap the file object 'f' and establish the initial record state."

        self.f = f
        self.reset()

    def reset(self):

        "To be used to reset the state of the reader or writer between records."

        pass

    def rewind(self):

        "Return to the start of the file and reset the record state."

        self.seek(0)
        self.reset()

    def seek(self, offset):

        "To be defined by readers."

        pass

    def flush(self):

        "To be defined by writers."

        pass

    def close(self):

        """
        Flush and close the underlying file, if it is still open. Calling this
        again afterwards does nothing.

        The file is closed and forgotten even if flushing or closing raises an
        exception, so that a failure does not leave the file open; the
        exception still propagates to the caller.
        """

        if self.f is not None:
            try:
                self.flush()
            finally:
                try:
                    self.f.close()
                finally:
                    self.f = None
4.71 +
class FileWriter(File):

    "Writing basic data types to files."

    def __init__(self, f):
        File.__init__(self, f)

        # Pending output and its total length.

        self.cache = []
        self.cache_length = 0

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        encoded = vint(number)
        self.write(encoded)

    def write_string(self, s, compress=0):

        """
        Write 's' to the file preceded by its length. If 'compress' is set to
        a true value, a flag character is written first: the flag of the
        compressor whose output is stored, or "-" if the data is stored
        uncompressed.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        if not compress:
            flag = ""
        else:

            # Take the first compressed form shorter than the original,
            # keeping the original if no compressor manages to shrink it.

            flag = "-"
            for candidate, fn in compressors:
                compressed = fn(s)
                if len(compressed) < len(s):
                    flag, s = candidate, compressed
                    break

        # Write the length of the data before the data itself.

        self.write(flag + vint(len(s)) + s)

    # Cache-affected methods.

    def write(self, s):

        "Queue 's' for output, flushing once enough data has accumulated."

        self.cache.append(s)
        self.cache_length += len(s)

        if self.cache_length < WRITE_CACHE_SIZE:
            return

        self.flush()

    def tell(self):

        "Return the file position that the next written data will occupy."

        return self.f.tell() + self.cache_length

    def flush(self):

        "Write all pending output to the file."

        pending = "".join(self.cache)
        self.f.write(pending)
        self.cache = []
        self.cache_length = 0
4.136 +
class FileReader(File):

    """
    Reading basic data types from files.

    Data is read from the file in large pieces and kept in a cache. The
    underlying file position always corresponds to the end of the cached data,
    and 'cache_start' is the position within the cache of the next unread
    character.
    """

    def __init__(self, f):
        File.__init__(self, f)
        self.reset_cache()

    def reset_cache(self):

        "Discard all cached data."

        self.cache = ""
        self.cache_length = 0
        self.cache_start = 0

    def read_number(self):

        """
        Read a number from the file. Raise EOFError if the file ends before
        the number is complete.
        """

        # Read each byte, adding it to the number.

        shift = 0
        number = 0
        read = self.read

        try:
            csd = ord(read(1))

            # A set high bit means that more digits follow.

            while csd & 128:
                number += ((csd & 127) << shift)
                shift += 7
                csd = ord(read(1))

            number += (csd << shift)

        # At the end of the file, read returns an empty string which ord
        # rejects.

        except TypeError:
            raise EOFError

        return number

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value. The result is a Unicode object.
        """

        # Compressible strings are preceded by a flag character.

        if decompress:
            flag = self.read(1)
        else:
            flag = "-"

        length = self.read_number()
        s = self.read(length)

        # Perform decompression if applicable.

        if flag != "-":
            fn = decompressors[flag]
            s = fn(s)

        # Convert strings to Unicode objects.

        return unicode(s, "utf-8")

    # Cache-affected methods.

    def read(self, n):

        """
        Return up to 'n' characters from the current position. Fewer are
        returned only if the file ends first.
        """

        needed = n - (self.cache_length - self.cache_start)

        # Read the needed number of characters, if possible.

        if needed > 0:
            s = self.f.read(max(needed, READ_CACHE_SIZE))
            self.cache += s
            self.cache_length += len(s)

        # Get the end of the requested block.

        next_start = self.cache_start + n
        s = self.cache[self.cache_start:next_start]

        # Reposition the pointer to the cache.

        self._seek_cache(len(s))
        return s

    def tell(self):

        "Return the position in the file of the next unread character."

        return self.f.tell() - self.cache_length + self.cache_start

    def seek(self, offset):

        "Move to the absolute position 'offset' in the file."

        delta = offset - self.tell()
        unread = self.cache_length - self.cache_start

        # If seeking forward to data still in the cache, navigate the cache
        # only. The underlying file must stay positioned at the end of the
        # cached data: moving it as well would make tell report the wrong
        # position and make the next refill append data already in the cache.

        if 0 <= delta < unread:
            self._seek_cache(delta)

        # Otherwise, reposition the file itself and start with an empty cache.

        else:
            self.f.seek(offset)
            self.reset_cache()

    def _seek_cache(self, delta):

        "Advance the position within the cache by 'delta' characters."

        next_start = self.cache_start + delta

        # If the cache is exhausted, empty it.

        if next_start > 0 and next_start >= len(self.cache):
            self.reset_cache()

        # If the cache is too big, resize it.

        elif next_start > READ_CACHE_RESIZE:
            self.cache = self.cache[next_start:]
            self.cache_length = len(self.cache)
            self.cache_start = 0

        # Otherwise, just reference the next part of the cache.

        else:
            self.cache_start = next_start
4.253 +
class FileOpener:

    "Opening files using their filenames."

    def __init__(self, filename):

        "Remember the 'filename'; nothing is opened yet."

        self.filename = filename

    def open(self, mode):

        "Open the remembered file using the given 'mode' and return it."

        handle = open(self.filename, mode)
        return handle

    def close(self):

        "Do nothing: the opener itself holds no open file."

        return None
4.266 +
4.267 +# vim: tabstop=4 expandtab shiftwidth=4
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/iixr/filesystem.py Tue Sep 15 00:15:11 2009 +0200
5.3 @@ -0,0 +1,129 @@
5.4 +#!/usr/bin/env python
5.5 +
5.6 +"""
5.7 +File access.
5.8 +
5.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
5.10 +
5.11 +This program is free software; you can redistribute it and/or modify it under
5.12 +the terms of the GNU General Public License as published by the Free Software
5.13 +Foundation; either version 3 of the License, or (at your option) any later
5.14 +version.
5.15 +
5.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
5.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
5.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
5.19 +
5.20 +You should have received a copy of the GNU General Public License along
5.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
5.22 +"""
5.23 +
5.24 +from iixr.fields import *
5.25 +from iixr.terms import *
5.26 +from iixr.positions import *
5.27 +from os import remove, rename # partition manipulation
5.28 +from os.path import join
5.29 +
5.30 +# Constants.
5.31 +
# Base names of the files making up a term dictionary partition.
TERM_FILENAMES = ("terms", "terms_index", "positions", "positions_index")

# Base names of the files making up a field dictionary partition.
FIELD_FILENAMES = ("fields", "fields_index")
5.34 +
5.35 +# Utility functions.
5.36 +
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Open the four term-related files for 'partition' under 'pathname' for
    writing and return a term dictionary writer built upon them. 'interval' is
    passed to the term dictionary writer and 'doc_interval' to the position
    dictionary writer.
    """

    # Term information and its index.

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # Position information and its index.

    positions_writer = PositionWriter(open(join(pathname, "positions-%s" % partition), "wb"))
    positions_index_writer = PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb"))

    positions_dict_writer = PositionDictionaryWriter(
        positions_writer, positions_index_writer, doc_interval
        )

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
5.60 +
def get_field_writer(pathname, partition, interval):

    """
    Open the two field-related files for 'partition' under 'pathname' for
    writing and return a field dictionary writer built upon them, using the
    given indexing 'interval'.
    """

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
5.76 +
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader for 'partition' under 'pathname'. The term
    file and its index are opened for reading here; the position files are
    handed over as openers holding their filenames.
    """

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    positions_dict_reader = PositionDictionaryReader(
        PositionOpener(join(pathname, "positions-%s" % partition)),
        PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
        )

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
5.96 +
def get_field_reader(pathname, partition):

    """
    Open the two field-related files for 'partition' under 'pathname' for
    reading and return a field dictionary reader built upon them.
    """

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
5.111 +
def rename_files(pathname, names, from_partition, to_partition):

    """
    Under 'pathname', rename each file whose base name is in 'names' so that
    its partition label changes from 'from_partition' to 'to_partition'.
    """

    for name in names:
        source = join(pathname, "%s-%s" % (name, from_partition))
        target = join(pathname, "%s-%s" % (name, to_partition))
        rename(source, target)
5.115 +
def rename_term_files(pathname, from_partition, to_partition):

    "Relabel the term dictionary files from 'from_partition' to 'to_partition'."

    names = TERM_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
5.118 +
def rename_field_files(pathname, from_partition, to_partition):

    "Relabel the field dictionary files from 'from_partition' to 'to_partition'."

    names = FIELD_FILENAMES
    rename_files(pathname, names, from_partition, to_partition)
5.121 +
def remove_files(pathname, names, partition):

    """
    Under 'pathname', delete each file whose base name is in 'names' and whose
    partition label is 'partition'.
    """

    for name in names:
        doomed = join(pathname, "%s-%s" % (name, partition))
        remove(doomed)
5.125 +
def remove_term_files(pathname, partition):

    "Delete the term dictionary files labelled with 'partition'."

    names = TERM_FILENAMES
    remove_files(pathname, names, partition)
5.128 +
def remove_field_files(pathname, partition):

    "Delete the field dictionary files labelled with 'partition'."

    names = FIELD_FILENAMES
    remove_files(pathname, names, partition)
5.131 +
5.132 +# vim: tabstop=4 expandtab shiftwidth=4
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
6.2 +++ b/iixr/index.py Tue Sep 15 00:15:11 2009 +0200
6.3 @@ -0,0 +1,326 @@
6.4 +#!/usr/bin/env python
6.5 +
6.6 +"""
6.7 +High-level classes.
6.8 +
6.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
6.10 +
6.11 +This program is free software; you can redistribute it and/or modify it under
6.12 +the terms of the GNU General Public License as published by the Free Software
6.13 +Foundation; either version 3 of the License, or (at your option) any later
6.14 +version.
6.15 +
6.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
6.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
6.19 +
6.20 +You should have received a copy of the GNU General Public License along
6.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
6.22 +"""
6.23 +
6.24 +from iixr.filesystem import *
6.25 +from os import listdir, mkdir # index and partition discovery
6.26 +from os.path import exists
6.27 +
6.28 +try:
6.29 + set
6.30 +except NameError:
6.31 + from sets import Set as set
6.32 +
6.33 +# Constants.
6.34 +
# Default indexing interval for terms.
TERM_INTERVAL = 100

# Default indexing interval for document position records.
DOCUMENT_INTERVAL = 100

# Default indexing interval for field records.
FIELD_INTERVAL = 100

# Default number of added documents after which a writer flushes to disk.
FLUSH_INTERVAL = 10000
6.39 +
6.40 +# High-level classes.
6.41 +
class Document:

    "A container of document information."

    def __init__(self, docnum):

        "Create an empty document having the number 'docnum'."

        self.docnum = docnum
        self.fields = []
        self.terms = {}

    def add_position(self, term, position):

        "Record that 'term' occurs at 'position' in the document."

        try:
            positions = self.terms[term]
        except KeyError:
            positions = self.terms[term] = []

        positions.append(position)

    def add_field(self, identifier, value):

        """
        Add a field having the given 'identifier' and 'value', with the value
        being converted to a Unicode string.
        """

        text = unicode(value)
        self.fields.append((identifier, text))

    def set_fields(self, fields):

        """
        Replace the document's fields with 'fields': a list of tuples each
        containing an integer identifier and a string value.
        """

        self.fields = fields
6.74 +
class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Prepare to write partitions under 'pathname' using the indexing
        'interval' and 'doc_interval', flushing to a new partition whenever
        'flush_interval' documents have been added (never, if it is zero).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Labels of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Information accumulated since the last flush.

        self.terms = {}
        self.docs = {}
        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.
        """

        # File the document's positions under each of its terms.

        for term, positions in doc.terms.items():
            term_docs = self.terms.setdefault(term, {})
            term_docs[doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields
        self.doc_counter += 1

        if not self.flush_interval or self.doc_counter < self.flush_interval:
            return

        self.flush_terms()
        self.flush_fields()
        self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(
            self.pathname, self.dict_partition, self.interval, self.doc_interval
            )

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(
            self.pathname, self.field_dict_partition, self.interval
            )

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        all_terms = self.terms

        # Get the terms in order.

        ordered_terms = sorted(all_terms)

        dict_writer = self.get_term_writer()

        for term in ordered_terms:
            dict_writer.write_term_positions(term, all_terms[term].items())

        dict_writer.close()

        # Start afresh with the next partition.

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Get the documents in order.

        ordered_docs = sorted(self.docs.items())

        field_dict_writer = self.get_field_writer()

        for docnum, fields in ordered_docs:
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        # Start afresh with the next partition.

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any terms and fields that have not yet been written."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
6.170 +
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        "Open the partition labelled 'merged' found under 'pathname'."

        partition = "merged"
        self.dict_reader = get_term_reader(pathname, partition)
        self.field_dict_reader = get_field_reader(pathname, partition)

    # Term queries, answered by the term dictionary reader.

    def find_terms(self, term):
        terms = self.dict_reader
        return terms.find_terms(term)

    def find_positions(self, term):
        terms = self.dict_reader
        return terms.find_positions(term)

    def get_frequency(self, term):
        terms = self.dict_reader
        return terms.get_frequency(term)

    def get_document_frequency(self, term):
        terms = self.dict_reader
        return terms.get_document_frequency(term)

    # Document queries, answered by the field dictionary reader.

    def get_fields(self, docnum):
        fields = self.field_dict_reader
        return fields.get_fields(docnum)

    def close(self):

        "Close the term dictionary reader and then the field dictionary reader."

        self.dict_reader.close()
        self.field_dict_reader.close()
6.197 +
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Refer to the index stored in the directory 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'. The index directory is created if
        it does not exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging all partitions beforehand.
        The 'partition' argument is currently not used: the reader always
        accesses the merged partition.
        """

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index without merging first. Raise OSError if
        the index directory does not exist. The 'partition' argument is
        currently not used.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def _find_partitions(self, prefix):

        """
        Return the labels of the partitions having a file in the index
        directory whose name starts with 'prefix', in directory listing order.
        """

        return [
            filename[len(prefix):]
            for filename in listdir(self.pathname)
            if filename.startswith(prefix)
            ]

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'. Afterwards, any term data is held by the partition
        labelled "merged".
        """

        partitions = self._find_partitions("terms-")

        if len(partitions) > 1:

            # Move any previously merged partition out of the way before
            # opening anything, so that no reader holds files (or filenames)
            # that are about to be renamed and replaced by the new output.

            if "merged" in partitions:
                rename_term_files(self.pathname, "merged", "old-merged")
                partitions[partitions.index("merged")] = "old-merged"

            readers = [get_term_reader(self.pathname, partition) for partition in partitions]

            # Write directly to a dictionary.

            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
            merger = TermDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_term_files(self.pathname, partition)

        # A lone partition only needs relabelling: no files are opened.

        elif len(partitions) == 1:
            partition = partitions[0]
            if partition != "merged":
                rename_term_files(self.pathname, partition, "merged")

    def merge_fields(self, interval=FIELD_INTERVAL):

        """
        Merge field dictionaries using the given indexing 'interval'.
        Afterwards, any field data is held by the partition labelled "merged".
        """

        partitions = self._find_partitions("fields-")

        if len(partitions) > 1:

            # Move any previously merged partition out of the way before
            # opening anything, so that no reader holds files that are about
            # to be renamed and replaced by the new output.

            if "merged" in partitions:
                rename_field_files(self.pathname, "merged", "old-merged")
                partitions[partitions.index("merged")] = "old-merged"

            readers = [get_field_reader(self.pathname, partition) for partition in partitions]

            # Write directly to a dictionary.

            writer = get_field_writer(self.pathname, "merged", interval)
            merger = FieldDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove old files.

            for partition in partitions:
                remove_field_files(self.pathname, partition)

        # A lone partition only needs relabelling: no files are opened.

        elif len(partitions) == 1:
            partition = partitions[0]
            if partition != "merged":
                rename_field_files(self.pathname, partition, "merged")

    def close(self):

        "Close the reader and the writer, if they have been obtained."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
6.328 +
6.329 +# vim: tabstop=4 expandtab shiftwidth=4
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2 +++ b/iixr/merging.py Tue Sep 15 00:15:11 2009 +0200
7.3 @@ -0,0 +1,74 @@
7.4 +#!/usr/bin/env python
7.5 +
7.6 +"""
7.7 +Dictionary merging classes.
7.8 +
7.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
7.10 +
7.11 +This program is free software; you can redistribute it and/or modify it under
7.12 +the terms of the GNU General Public License as published by the Free Software
7.13 +Foundation; either version 3 of the License, or (at your option) any later
7.14 +version.
7.15 +
7.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
7.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
7.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
7.19 +
7.20 +You should have received a copy of the GNU General Public License along
7.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
7.22 +"""
7.23 +
7.24 +from itermerge import itermerge
7.25 +
7.26 +class Merger:
7.27 +
7.28 + "Merge files."
7.29 +
7.30 + def __init__(self, writer, readers):
7.31 + self.writer = writer
7.32 + self.readers = readers
7.33 +
7.34 + def close(self):
7.35 + for reader in self.readers:
7.36 + reader.close()
7.37 + self.writer.close()
7.38 +
7.39 +class TermDictionaryMerger(Merger):
7.40 +
7.41 + "Merge term and position files."
7.42 +
7.43 + def merge(self):
7.44 +
7.45 + """
7.46 + Merge terms and positions from the readers, sending them to the writer.
7.47 + """
7.48 +
7.49 + last_term = None
7.50 + current_readers = []
7.51 +
7.52 + for term, frequency, doc_frequency, positions in itermerge(self.readers):
7.53 + if term == last_term:
7.54 + current_readers.append(positions)
7.55 + else:
7.56 + if current_readers:
7.57 + self.writer.write_term_positions(last_term, itermerge(current_readers))
7.58 + last_term = term
7.59 + current_readers = [positions]
7.60 + else:
7.61 + if current_readers:
7.62 + self.writer.write_term_positions(last_term, itermerge(current_readers))
7.63 +
7.64 +class FieldDictionaryMerger(Merger):
7.65 +
7.66 + "Merge field files."
7.67 +
7.68 + def merge(self):
7.69 +
7.70 + """
7.71 + Merge fields from the readers, sending them to the writer.
7.72 + """
7.73 +
7.74 + for docnum, fields in itermerge(self.readers):
7.75 + self.writer.write_fields(docnum, fields)
7.76 +
7.77 +# vim: tabstop=4 expandtab shiftwidth=4
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
8.2 +++ b/iixr/positions.py Tue Sep 15 00:15:11 2009 +0200
8.3 @@ -0,0 +1,525 @@
8.4 +#!/usr/bin/env python
8.5 +
8.6 +"""
8.7 +Specific classes for storing position information.
8.8 +
8.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
8.10 +
8.11 +This program is free software; you can redistribute it and/or modify it under
8.12 +the terms of the GNU General Public License as published by the Free Software
8.13 +Foundation; either version 3 of the License, or (at your option) any later
8.14 +version.
8.15 +
8.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
8.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
8.19 +
8.20 +You should have received a copy of the GNU General Public License along
8.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
8.22 +"""
8.23 +
8.24 +from iixr.files import *
8.25 +from iixr.data import vint
8.26 +
8.27 +class PositionWriter(FileWriter):
8.28 +
8.29 + "Writing position information to files."
8.30 +
8.31 + def reset(self):
8.32 + self.last_docnum = 0
8.33 +
8.34 + def write_positions(self, docnum, positions):
8.35 +
8.36 + """
8.37 + Write for the document 'docnum' the given 'positions'.
8.38 + Return the offset of the written record.
8.39 + """
8.40 +
8.41 + if docnum < self.last_docnum:
8.42 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
8.43 +
8.44 + # Record the offset of this record.
8.45 +
8.46 + offset = self.tell()
8.47 +
8.48 + # Make sure that the positions are sorted.
8.49 +
8.50 + positions.sort()
8.51 +
8.52 + # Write the position deltas.
8.53 +
8.54 + output = []
8.55 + last = 0
8.56 +
8.57 + for position in positions:
8.58 + output.append(vint(position - last))
8.59 + last = position
8.60 +
8.61 + # Write the document number delta.
8.62 + # Write the number of positions.
8.63 + # Then write the positions.
8.64 +
8.65 + self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))
8.66 +
8.67 + self.last_docnum = docnum
8.68 + return offset
8.69 +
8.70 +class PositionOpener(FileOpener):
8.71 +
8.72 + "Reading position information from files."
8.73 +
8.74 + def read_term_positions(self, offset, count):
8.75 +
8.76 + """
8.77 + Read all positions from 'offset', seeking to that position in the file
8.78 + before reading. The number of documents available for reading is limited
8.79 + to 'count'.
8.80 + """
8.81 +
8.82 + # Duplicate the file handle.
8.83 +
8.84 + f = self.open("rb")
8.85 + return PositionIterator(f, offset, count)
8.86 +
8.87 +class PositionIndexWriter(FileWriter):
8.88 +
8.89 + "Writing position index information to files."
8.90 +
8.91 + def reset(self):
8.92 + self.last_docnum = 0
8.93 + self.last_pos_offset = 0
8.94 +
8.95 + def write_positions(self, docnum, pos_offset, count):
8.96 +
8.97 + """
8.98 + Write the given 'docnum', 'pos_offset' and document 'count' to the
8.99 + position index file.
8.100 + """
8.101 +
8.102 + # Record the offset of this record.
8.103 +
8.104 + offset = self.tell()
8.105 + output = []
8.106 +
8.107 + # Write the document number delta.
8.108 +
8.109 + output.append(vint(docnum - self.last_docnum))
8.110 + self.last_docnum = docnum
8.111 +
8.112 + # Write the position file offset delta.
8.113 +
8.114 + output.append(vint(pos_offset - self.last_pos_offset))
8.115 + self.last_pos_offset = pos_offset
8.116 +
8.117 + # Write the document count.
8.118 +
8.119 + output.append(vint(count))
8.120 +
8.121 + # Actually write the data.
8.122 +
8.123 + self.write("".join(output))
8.124 +
8.125 + return offset
8.126 +
8.127 +class PositionIndexOpener(FileOpener):
8.128 +
8.129 + "Reading position index information from files."
8.130 +
8.131 + def read_term_positions(self, offset, doc_frequency):
8.132 +
8.133 + """
8.134 + Read all positions from 'offset', seeking to that position in the file
8.135 + before reading. The number of documents available for reading is limited
8.136 + to 'doc_frequency'.
8.137 + """
8.138 +
8.139 + # Duplicate the file handle.
8.140 +
8.141 + f = self.open("rb")
8.142 + return PositionIndexIterator(f, offset, doc_frequency)
8.143 +
8.144 +# Iterators for position-related files.
8.145 +
8.146 +class IteratorBase:
8.147 +
8.148 + def __init__(self, count):
8.149 + self.replenish(count)
8.150 +
8.151 + def replenish(self, count):
8.152 + self.count = count
8.153 + self.read_documents = 0
8.154 +
8.155 + def __len__(self):
8.156 + return self.count
8.157 +
8.158 + def sort(self):
8.159 + pass # Stored document positions are already sorted.
8.160 +
8.161 + def __iter__(self):
8.162 + return self
8.163 +
8.164 +class PositionIterator(FileReader, IteratorBase):
8.165 +
8.166 + "Iterating over document positions."
8.167 +
8.168 + def __init__(self, f, offset, count):
8.169 + FileReader.__init__(self, f)
8.170 + IteratorBase.__init__(self, count)
8.171 + self.seek(offset)
8.172 +
8.173 + def reset(self):
8.174 + self.last_docnum = 0
8.175 +
8.176 + def read_positions(self):
8.177 +
8.178 + "Read positions, returning a document number and a list of positions."
8.179 +
8.180 + # Read the document number delta and add it to the last number.
8.181 +
8.182 + self.last_docnum += self.read_number()
8.183 +
8.184 + # Read the number of positions.
8.185 +
8.186 + npositions = self.read_number()
8.187 +
8.188 + # Read the position deltas, adding each previous position to get the
8.189 + # appropriate collection of absolute positions.
8.190 +
8.191 + i = 0
8.192 + last = 0
8.193 + positions = []
8.194 +
8.195 + while i < npositions:
8.196 + last += self.read_number()
8.197 + positions.append(last)
8.198 + i += 1
8.199 +
8.200 + return self.last_docnum, positions
8.201 +
8.202 + def next(self):
8.203 +
8.204 + "Read positions for a single document."
8.205 +
8.206 + if self.read_documents < self.count:
8.207 + self.read_documents += 1
8.208 + return self.read_positions()
8.209 + else:
8.210 + raise StopIteration
8.211 +
8.212 +class PositionIndexIterator(FileReader, IteratorBase):
8.213 +
8.214 + "Iterating over position index entries."
8.215 +
8.216 + def __init__(self, f, offset, count):
8.217 + FileReader.__init__(self, f)
8.218 + IteratorBase.__init__(self, count)
8.219 + self.seek(offset)
8.220 + self.section_count = 0
8.221 +
8.222 + def reset(self):
8.223 + self.last_docnum = 0
8.224 + self.last_pos_offset = 0
8.225 +
8.226 + def read_positions(self):
8.227 +
8.228 + """
8.229 + Read a document number, a position file offset and the number of
8.230 + documents in the section of the position file found at that offset.
8.231 + """
8.232 +
8.233 + # Read the document number delta.
8.234 +
8.235 + self.last_docnum += self.read_number()
8.236 +
8.237 + # Read the offset delta.
8.238 +
8.239 + self.last_pos_offset += self.read_number()
8.240 +
8.241 + # Read the document count.
8.242 +
8.243 + count = self.read_number()
8.244 +
8.245 + return self.last_docnum, self.last_pos_offset, count
8.246 +
8.247 + def next(self):
8.248 +
8.249 + "Read the index entry for the next section of documents."
8.250 +
8.251 + self.read_documents += self.section_count
8.252 + if self.read_documents < self.count:
8.253 + docnum, pos_offset, self.section_count = t = self.read_positions()
8.254 + return t
8.255 + else:
8.256 + raise StopIteration
8.257 +
8.258 +class PositionDictionaryWriter:
8.259 +
8.260 + "Writing position dictionaries."
8.261 +
8.262 + def __init__(self, position_writer, position_index_writer, interval):
8.263 + self.position_writer = position_writer
8.264 + self.position_index_writer = position_index_writer
8.265 + self.interval = interval
8.266 +
8.267 + def write_term_positions(self, doc_positions):
8.268 +
8.269 + """
8.270 + Write all 'doc_positions' - a collection of tuples of the form (document
8.271 + number, position list) - to the file.
8.272 +
8.273 + Add some records to the index, making dictionary entries.
8.274 +
8.275 + Return a tuple containing the offset of the written data, the frequency
8.276 + (number of positions), and document frequency (number of documents) for
8.277 + the term involved.
8.278 + """
8.279 +
8.280 + # Reset the writers.
8.281 +
8.282 + self.position_writer.reset()
8.283 + self.position_index_writer.reset()
8.284 +
8.285 + index_offset = None
8.286 +
8.287 + # Write the positions.
8.288 +
8.289 + frequency = 0
8.290 + first_docnum = None
8.291 + first_offset = None
8.292 + count = 0
8.293 +
8.294 + doc_positions.sort()
8.295 +
8.296 + for docnum, positions in doc_positions:
8.297 + pos_offset = self.position_writer.write_positions(docnum, positions)
8.298 +
8.299 + # Retain the first record offset for a subsequent index entry.
8.300 +
8.301 + if first_offset is None:
8.302 + first_offset = pos_offset
8.303 + first_docnum = docnum
8.304 +
8.305 + frequency += len(positions)
8.306 + count += 1
8.307 +
8.308 + # Every {interval} entries, write an index entry.
8.309 +
8.310 + if count % self.interval == 0:
8.311 + io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
8.312 +
8.313 + # Remember the first index entry offset.
8.314 +
8.315 + if index_offset is None:
8.316 + index_offset = io
8.317 +
8.318 + first_offset = None
8.319 + first_docnum = None
8.320 +
8.321 + # Reset the position writer so that position readers accessing
8.322 + # a section start with the correct document number.
8.323 +
8.324 + self.position_writer.reset()
8.325 +
8.326 + # Finish writing an index entry for the remaining documents.
8.327 +
8.328 + else:
8.329 + if first_offset is not None:
8.330 + io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
8.331 +
8.332 + # Remember the first index entry offset.
8.333 +
8.334 + if index_offset is None:
8.335 + index_offset = io
8.336 +
8.337 + return index_offset, frequency, count
8.338 +
8.339 + def close(self):
8.340 + self.position_writer.close()
8.341 + self.position_index_writer.close()
8.342 +
8.343 +class PositionDictionaryReader:
8.344 +
8.345 + "Reading position dictionaries."
8.346 +
8.347 + def __init__(self, position_opener, position_index_opener):
8.348 + self.position_opener = position_opener
8.349 + self.position_index_opener = position_index_opener
8.350 +
8.351 + def read_term_positions(self, offset, doc_frequency):
8.352 +
8.353 + """
8.354 + Return an iterator for dictionary entries starting at 'offset' with the
8.355 + given 'doc_frequency'.
8.356 + """
8.357 +
8.358 + return PositionDictionaryIterator(self.position_opener,
8.359 + self.position_index_opener, offset, doc_frequency)
8.360 +
8.361 + def close(self):
8.362 + pass
8.363 +
8.364 +class PositionDictionaryIterator:
8.365 +
8.366 + "Iteration over position dictionary entries."
8.367 +
8.368 + def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
8.369 + self.position_opener = position_opener
8.370 + self.doc_frequency = doc_frequency
8.371 + self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
8.372 + self.iterator = None
8.373 +
8.374 + # Remember the last values.
8.375 +
8.376 + self.found_docnum, self.found_positions = None, None
8.377 +
8.378 + # Maintain state for the next index entry, if read.
8.379 +
8.380 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
8.381 +
8.382 + # Initialise the current index entry and current position file iterator.
8.383 +
8.384 + self._next_section()
8.385 + self._init_section()
8.386 +
8.387 + # Sequence methods.
8.388 +
8.389 + def __len__(self):
8.390 + return self.doc_frequency
8.391 +
8.392 + def sort(self):
8.393 + pass
8.394 +
8.395 + # Iterator methods.
8.396 +
8.397 + def __iter__(self):
8.398 + return self
8.399 +
8.400 + def next(self):
8.401 +
8.402 + """
8.403 + Attempt to get the next document record from the section in the
8.404 + positions file.
8.405 + """
8.406 +
8.407 + # Return any visited but unrequested record.
8.408 +
8.409 + if self.found_docnum is not None:
8.410 + t = self.found_docnum, self.found_positions
8.411 + self.found_docnum, self.found_positions = None, None
8.412 + return t
8.413 +
8.414 + # Or search for the next record.
8.415 +
8.416 + while 1:
8.417 +
8.418 + # Either return the next record.
8.419 +
8.420 + try:
8.421 + return self.iterator.next()
8.422 +
8.423 + # Or, where a section is finished, get the next section and try again.
8.424 +
8.425 + except StopIteration:
8.426 +
8.427 + # Where a section follows, update the index iterator, but keep
8.428 + # reading using the same file iterator (since the data should
8.429 + # just follow on from the last section).
8.430 +
8.431 + self._next_section()
8.432 + self.iterator.replenish(self.section_count)
8.433 +
8.434 + # Reset the state of the iterator to make sure that document
8.435 + # numbers are correct.
8.436 +
8.437 + self.iterator.reset()
8.438 +
8.439 + def from_document(self, docnum):
8.440 +
8.441 + """
8.442 + Attempt to navigate to a positions entry for the given 'docnum',
8.443 + returning the positions for 'docnum', or None otherwise.
8.444 + """
8.445 +
8.446 + # Return any unrequested document positions.
8.447 +
8.448 + if docnum == self.found_docnum:
8.449 + return self.found_positions
8.450 +
8.451 + # Read ahead in the index until the next entry refers to a document
8.452 + # later than the desired document.
8.453 +
8.454 + try:
8.455 + if self.next_docnum is None:
8.456 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
8.457 +
8.458 + # Read until the next entry is after the desired document number,
8.459 + # or until the end of the results.
8.460 +
8.461 + while self.next_docnum <= docnum:
8.462 + self._next_read_section()
8.463 + if self.docnum < docnum:
8.464 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
8.465 + else:
8.466 + break
8.467 +
8.468 + except StopIteration:
8.469 + pass
8.470 +
8.471 + # Navigate in the position file to the document.
8.472 +
8.473 + self._init_section()
8.474 +
8.475 + try:
8.476 + while 1:
8.477 + found_docnum, found_positions = self.iterator.next()
8.478 +
8.479 + # Return the desired document positions or None (retaining the
8.480 + # positions for the document immediately after).
8.481 +
8.482 + if docnum == found_docnum:
8.483 + return found_positions
8.484 + elif docnum < found_docnum:
8.485 + self.found_docnum, self.found_positions = found_docnum, found_positions
8.486 + return None
8.487 +
8.488 + except StopIteration:
8.489 + return None
8.490 +
8.491 + # Internal methods.
8.492 +
8.493 + def _next_section(self):
8.494 +
8.495 + "Attempt to get the next section in the index."
8.496 +
8.497 + if self.next_docnum is None:
8.498 + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
8.499 + else:
8.500 + self._next_read_section()
8.501 +
8.502 + def _next_read_section(self):
8.503 +
8.504 + """
8.505 + Make the next index entry the current one without reading from the
8.506 + index.
8.507 + """
8.508 +
8.509 + self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
8.510 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
8.511 +
8.512 + def _init_section(self):
8.513 +
8.514 + "Initialise the iterator for the section in the position file."
8.515 +
8.516 + if self.iterator is not None:
8.517 + self.iterator.close()
8.518 + self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
8.519 +
8.520 + def close(self):
8.521 + if self.iterator is not None:
8.522 + self.iterator.close()
8.523 + self.iterator = None
8.524 + if self.index_iterator is not None:
8.525 + self.index_iterator.close()
8.526 + self.index_iterator = None
8.527 +
8.528 +# vim: tabstop=4 expandtab shiftwidth=4
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
9.2 +++ b/iixr/terms.py Tue Sep 15 00:15:11 2009 +0200
9.3 @@ -0,0 +1,395 @@
9.4 +#!/usr/bin/env python
9.5 +
9.6 +"""
9.7 +Specific classes for storing term information.
9.8 +
9.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
9.10 +
9.11 +This program is free software; you can redistribute it and/or modify it under
9.12 +the terms of the GNU General Public License as published by the Free Software
9.13 +Foundation; either version 3 of the License, or (at your option) any later
9.14 +version.
9.15 +
9.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
9.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
9.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
9.19 +
9.20 +You should have received a copy of the GNU General Public License along
9.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
9.22 +"""
9.23 +
9.24 +from iixr.files import *
9.25 +from os.path import commonprefix # to find common string prefixes
9.26 +from bisect import bisect_right # to find terms in the dictionary index
9.27 +
9.28 +class TermWriter(FileWriter):
9.29 +
9.30 + "Writing term information to files."
9.31 +
9.32 + def reset(self):
9.33 + self.last_term = ""
9.34 + self.last_offset = 0
9.35 +
9.36 + def write_term(self, term, offset, frequency, doc_frequency):
9.37 +
9.38 + """
9.39 + Write the given 'term', its position file 'offset', its 'frequency' and
9.40 + its 'doc_frequency' (number of documents in which it appears) to the
9.41 + term information file. Return the offset after the term information was
9.42 + written to the file.
9.43 + """
9.44 +
9.45 + # Write the prefix length and term suffix.
9.46 +
9.47 + common = len(commonprefix([self.last_term, term]))
9.48 + suffix = term[common:]
9.49 +
9.50 + self.write_number(common)
9.51 + self.write_string(suffix)
9.52 +
9.53 + # Write the offset delta.
9.54 +
9.55 + self.write_number(offset - self.last_offset)
9.56 +
9.57 + # Write the frequency.
9.58 +
9.59 + self.write_number(frequency)
9.60 +
9.61 + # Write the document frequency.
9.62 +
9.63 + self.write_number(doc_frequency)
9.64 +
9.65 + self.last_term = term
9.66 + self.last_offset = offset
9.67 +
9.68 + return self.tell()
9.69 +
9.70 +class TermReader(FileReader):
9.71 +
9.72 + "Reading term information from files."
9.73 +
9.74 + def reset(self):
9.75 + self.last_term = ""
9.76 + self.last_offset = 0
9.77 +
9.78 + def read_term(self):
9.79 +
9.80 + """
9.81 + Read a term, its position file offset, its frequency and its document
9.82 + frequency from the term information file.
9.83 + """
9.84 +
9.85 + # Read the prefix length and term suffix.
9.86 +
9.87 + common = self.read_number()
9.88 + suffix = self.read_string()
9.89 +
9.90 + self.last_term = self.last_term[:common] + suffix
9.91 +
9.92 + # Read the offset delta.
9.93 +
9.94 + self.last_offset += self.read_number()
9.95 +
9.96 + # Read the frequency.
9.97 +
9.98 + frequency = self.read_number()
9.99 +
9.100 + # Read the document frequency.
9.101 +
9.102 + doc_frequency = self.read_number()
9.103 +
9.104 + return self.last_term, self.last_offset, frequency, doc_frequency
9.105 +
9.106 + def go_to_term(self, term, offset, info_offset):
9.107 +
9.108 + """
9.109 + Seek past the entry for 'term' having 'offset' to 'info_offset'. This
9.110 + permits the scanning for later terms from the specified term.
9.111 + """
9.112 +
9.113 + self.seek(info_offset)
9.114 + self.last_term = term
9.115 + self.last_offset = offset
9.116 +
9.117 +class TermIndexWriter(TermWriter):
9.118 +
9.119 + "Writing term dictionary index details to files."
9.120 +
9.121 + def reset(self):
9.122 + TermWriter.reset(self)
9.123 + self.last_info_offset = 0
9.124 +
9.125 + def write_term(self, term, offset, frequency, doc_frequency, info_offset):
9.126 +
9.127 + """
9.128 + Write the given 'term', its position file 'offset', its 'frequency' and
9.129 + its 'doc_frequency' to the term dictionary index file, along with the
9.130 + 'info_offset' in the term information file.
9.131 + """
9.132 +
9.133 + TermWriter.write_term(self, term, offset, frequency, doc_frequency)
9.134 +
9.135 + # Write the information file offset delta.
9.136 +
9.137 + self.write_number(info_offset - self.last_info_offset)
9.138 + self.last_info_offset = info_offset
9.139 +
9.140 +class TermIndexReader(TermReader):
9.141 +
9.142 + "Reading term dictionary index details from files."
9.143 +
9.144 + def reset(self):
9.145 + TermReader.reset(self)
9.146 + self.last_info_offset = 0
9.147 +
9.148 + def read_term(self):
9.149 +
9.150 + """
9.151 + Read a term, its position file offset, its frequency, its document
9.152 + frequency and a term information file offset from the term dictionary
9.153 + index file.
9.154 + """
9.155 +
9.156 + term, offset, frequency, doc_frequency = TermReader.read_term(self)
9.157 +
9.158 + # Read the offset delta.
9.159 +
9.160 + self.last_info_offset += self.read_number()
9.161 +
9.162 + return term, offset, frequency, doc_frequency, self.last_info_offset
9.163 +
9.164 +class TermDictionaryWriter:
9.165 +
9.166 + "Writing term dictionaries."
9.167 +
9.168 + def __init__(self, info_writer, index_writer, position_dict_writer, interval):
9.169 + self.info_writer = info_writer
9.170 + self.index_writer = index_writer
9.171 + self.position_dict_writer = position_dict_writer
9.172 + self.interval = interval
9.173 + self.entry = 0
9.174 +
9.175 + def _write_term(self, term, offset, frequency, doc_frequency):
9.176 +
9.177 + """
9.178 + Write the given 'term', its position file 'offset', its 'frequency' and
9.179 + its 'doc_frequency' (number of documents in which it appears) to the
9.180 + term information file, also writing an entry to the term dictionary
9.181 + index file for the first of every 'interval' terms.
9.182 + """
9.183 +
9.184 + info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
9.185 +
9.186 + if self.entry % self.interval == 0:
9.187 + self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
9.188 +
9.189 + self.entry += 1
9.190 +
9.191 + def write_term_positions(self, term, doc_positions):
9.192 +
9.193 + """
9.194 + Write the given 'term' and the 'doc_positions' recording the documents
9.195 + and positions at which the term is found.
9.196 + """
9.197 +
9.198 + offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
9.199 + self._write_term(term, offset, frequency, doc_frequency)
9.200 +
9.201 + def close(self):
9.202 + self.info_writer.close()
9.203 + self.index_writer.close()
9.204 + self.position_dict_writer.close()
9.205 +
9.206 +class TermDictionaryReader:
9.207 +
9.208 + "Reading term dictionaries."
9.209 +
9.210 + def __init__(self, info_reader, index_reader, position_dict_reader):
9.211 + self.info_reader = info_reader
9.212 + self.index_reader = index_reader
9.213 + self.position_dict_reader = position_dict_reader
9.214 +
9.215 + self.terms = []
9.216 + try:
9.217 + while 1:
9.218 + self.terms.append(self.index_reader.read_term())
9.219 + except EOFError:
9.220 + pass
9.221 +
9.222 + # Large numbers for ordering purposes.
9.223 +
9.224 + if self.terms:
9.225 + self.max_offset = self.terms[-1][1] + 1
9.226 + else:
9.227 + self.max_offset = None
9.228 +
9.229 + def _find_closest_entry(self, term):
9.230 +
9.231 + """
9.232 + Find the offsets and frequencies of 'term' from the term dictionary or
9.233 + the closest term starting with the value of 'term'.
9.234 +
9.235 + Return the closest index entry consisting of a term, the position file
9.236 + offset, the term frequency, the document frequency, and the term details
9.237 + file offset.
9.238 + """
9.239 +
9.240 + i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
9.241 +
9.242 + # Get the entry position providing the term or one preceding it.
9.243 + # If no entry precedes the requested term, return the very first entry
9.244 + # as the closest.
9.245 +
9.246 + if i == -1:
9.247 + return self.terms[0]
9.248 + else:
9.249 + return self.terms[i]
9.250 +
9.251 + def _find_closest_term(self, term):
9.252 +
9.253 + """
9.254 + Find the offsets and frequencies of 'term' from the term dictionary or
9.255 + the closest term starting with the value of 'term'.
9.256 +
9.257 + Return the closest term (or the term itself), the position file offset,
9.258 + the term frequency, the document frequency, and the term details file
9.259 + offset (or None if the reader is already positioned).
9.260 + """
9.261 +
9.262 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
9.263 +
9.264 + # Where the term is found immediately, return the offset and
9.265 + # frequencies. If the term does not appear, return the details of the
9.266 + # closest entry.
9.267 +
9.268 + if term <= found_term:
9.269 + return found_term, offset, frequency, doc_frequency, info_offset
9.270 +
9.271 + # Otherwise, seek past the index term's entry in the information file
9.272 + # and scan for the desired term.
9.273 +
9.274 + else:
9.275 + self.info_reader.go_to_term(found_term, offset, info_offset)
9.276 + try:
9.277 + while term > found_term:
9.278 + found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
9.279 + except EOFError:
9.280 + pass
9.281 +
9.282 + return found_term, offset, frequency, doc_frequency, None
9.283 +
9.284 + def _find_term(self, term):
9.285 +
9.286 + """
9.287 + Find the position file offset and frequency of 'term' from the term
9.288 + dictionary.
9.289 + """
9.290 +
9.291 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
9.292 +
9.293 + # If the term is found, return the offset and frequencies.
9.294 +
9.295 + if term == found_term:
9.296 + return offset, frequency, doc_frequency
9.297 + else:
9.298 + return None
9.299 +
9.300 + def _get_positions(self, offset, doc_frequency):
9.301 + return self.position_dict_reader.read_term_positions(offset, doc_frequency)
9.302 +
9.303 + # Iterator convenience methods.
9.304 +
9.305 + def __iter__(self):
9.306 + self.rewind()
9.307 + return self
9.308 +
9.309 + def next(self):
9.310 + try:
9.311 + return self.read_term()
9.312 + except EOFError:
9.313 + raise StopIteration
9.314 +
9.315 + # Sequential access methods.
9.316 +
9.317 + def rewind(self):
9.318 + self.info_reader.rewind()
9.319 +
9.320 + def read_term(self):
9.321 +
9.322 + """
9.323 + Return the next term, its frequency, its document frequency, and the
9.324 + documents and positions at which the term is found.
9.325 + """
9.326 +
9.327 + term, offset, frequency, doc_frequency = self.info_reader.read_term()
9.328 + positions = self._get_positions(offset, doc_frequency)
9.329 + return term, frequency, doc_frequency, positions
9.330 +
9.331 + # Query methods.
9.332 +
9.333 + def find_terms(self, term):
9.334 +
9.335 + "Return all terms whose values start with the value of 'term'."
9.336 +
9.337 + terms = []
9.338 +
9.339 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
9.340 +
9.341 + # Position the reader, if necessary.
9.342 +
9.343 + if info_offset is not None:
9.344 + self.info_reader.go_to_term(found_term, offset, info_offset)
9.345 +
9.346 + # Read and record terms.
9.347 +
9.348 + try:
9.349 + # Add the found term if it starts with the specified term.
9.350 +
9.351 + while found_term.startswith(term):
9.352 + terms.append(found_term)
9.353 + found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
9.354 +
9.355 + except EOFError:
9.356 + pass
9.357 +
9.358 + return terms
9.359 +
9.360 + def find_positions(self, term):
9.361 +
9.362 + "Return the documents and positions at which the given 'term' is found."
9.363 +
9.364 + t = self._find_term(term)
9.365 + if t is None:
9.366 + return None
9.367 + else:
9.368 + offset, frequency, doc_frequency = t
9.369 + return self._get_positions(offset, doc_frequency)
9.370 +
9.371 + def get_frequency(self, term):
9.372 +
9.373 + "Return the frequency of the given 'term'."
9.374 +
9.375 + t = self._find_term(term)
9.376 + if t is None:
9.377 + return None
9.378 + else:
9.379 + offset, frequency, doc_frequency = t
9.380 + return frequency
9.381 +
9.382 + def get_document_frequency(self, term):
9.383 +
9.384 + "Return the document frequency of the given 'term'."
9.385 +
9.386 + t = self._find_term(term)
9.387 + if t is None:
9.388 + return None
9.389 + else:
9.390 + offset, frequency, doc_frequency = t
9.391 + return doc_frequency
9.392 +
9.393 + def close(self):
9.394 + self.info_reader.close()
9.395 + self.index_reader.close()
9.396 + self.position_dict_reader.close()
9.397 +
9.398 +# vim: tabstop=4 expandtab shiftwidth=4
10.1 --- a/setup.py Mon Sep 14 21:23:32 2009 +0200
10.2 +++ b/setup.py Tue Sep 15 00:15:11 2009 +0200
10.3 @@ -11,6 +11,7 @@
10.4 author_email = "paul@boddie.org.uk",
10.5 url = "http://www.boddie.org.uk/python/iixr.html",
10.6 version = "0.1",
10.7 - py_modules = ["iixr", "itermerge"],
10.8 + py_modules = ["itermerge"],
10.9 + packages = ["iixr"],
10.10 ext_modules = [vint],
10.11 )
11.1 --- a/test.py Mon Sep 14 21:23:32 2009 +0200
11.2 +++ b/test.py Tue Sep 15 00:15:11 2009 +0200
11.3 @@ -1,6 +1,10 @@
11.4 #!/usr/bin/env python
11.5
11.6 -import iixr
11.7 +from iixr.files import *
11.8 +from iixr.fields import *
11.9 +from iixr.terms import *
11.10 +from iixr.positions import *
11.11 +from iixr.index import *
11.12 import os
11.13
11.14 # Remove old test files.
11.15 @@ -23,13 +27,13 @@
11.16 numbers = [12345678, 0, 1, 127, 128, 255, 256]
11.17
11.18 f = open("test", "wb")
11.19 -w = iixr.FileWriter(f)
11.20 +w = FileWriter(f)
11.21 for number in numbers:
11.22 w.write_number(number)
11.23 w.close()
11.24
11.25 f = open("test", "rb")
11.26 -r = iixr.FileReader(f)
11.27 +r = FileReader(f)
11.28 for number in numbers:
11.29 n = r.read_number()
11.30 print number == n, number, n
11.31 @@ -52,7 +56,7 @@
11.32 ]
11.33
11.34 f = open("testP", "wb")
11.35 -w = iixr.PositionWriter(f)
11.36 +w = PositionWriter(f)
11.37 for doc_positions in all_doc_positions:
11.38 for docnum, positions in doc_positions:
11.39 w.write_positions(docnum, positions)
11.40 @@ -60,7 +64,7 @@
11.41 w.close()
11.42
11.43 f = open("testP", "rb")
11.44 -r = iixr.PositionIterator(f, 0, None)
11.45 +r = PositionIterator(f, 0, None)
11.46 for doc_positions in all_doc_positions:
11.47 for docnum, positions in doc_positions:
11.48 d, p = r.read_positions()
11.49 @@ -84,7 +88,7 @@
11.50
11.51 offsets = []
11.52 f = open("testPI", "wb")
11.53 -w = iixr.PositionIndexWriter(f)
11.54 +w = PositionIndexWriter(f)
11.55 for term_positions in indexed_positions:
11.56 offset = None
11.57 doc_frequency = 0
11.58 @@ -97,7 +101,7 @@
11.59 offsets.append((offset, doc_frequency))
11.60 w.close()
11.61
11.62 -r = iixr.PositionIndexOpener("testPI")
11.63 +r = PositionIndexOpener("testPI")
11.64 offsets.reverse()
11.65 indexed_positions.reverse()
11.66 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
11.67 @@ -111,19 +115,19 @@
11.68 # Test position dictionaries.
11.69
11.70 f = open("testP", "wb")
11.71 -w = iixr.PositionWriter(f)
11.72 +w = PositionWriter(f)
11.73 f2 = open("testPI", "wb")
11.74 -w2 = iixr.PositionIndexWriter(f2)
11.75 -wd = iixr.PositionDictionaryWriter(w, w2, 2)
11.76 +w2 = PositionIndexWriter(f2)
11.77 +wd = PositionDictionaryWriter(w, w2, 2)
11.78 offsets = []
11.79 for doc_positions in all_doc_positions:
11.80 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
11.81 offsets.append((offset, doc_frequency))
11.82 wd.close()
11.83
11.84 -r = iixr.PositionOpener("testP")
11.85 -r2 = iixr.PositionIndexOpener("testPI")
11.86 -rd = iixr.PositionDictionaryReader(r, r2)
11.87 +r = PositionOpener("testP")
11.88 +r2 = PositionIndexOpener("testPI")
11.89 +rd = PositionDictionaryReader(r, r2)
11.90 offsets.reverse()
11.91 all_doc_positions.reverse()
11.92 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
11.93 @@ -144,13 +148,13 @@
11.94 ]
11.95
11.96 f = open("testF", "wb")
11.97 -w = iixr.FieldWriter(f)
11.98 +w = FieldWriter(f)
11.99 for docnum, fields in doc_fields:
11.100 w.write_fields(docnum, list(enumerate(fields)))
11.101 w.close()
11.102
11.103 f = open("testF", "rb")
11.104 -r = iixr.FieldReader(f)
11.105 +r = FieldReader(f)
11.106 for docnum, fields in doc_fields:
11.107 dn, df = r.read_fields()
11.108 print docnum == dn, docnum, dn
11.109 @@ -166,13 +170,13 @@
11.110 ]
11.111
11.112 f = open("testFI", "wb")
11.113 -w = iixr.FieldIndexWriter(f)
11.114 +w = FieldIndexWriter(f)
11.115 for docnum, offset in indexed_docs:
11.116 w.write_document(docnum, offset)
11.117 w.close()
11.118
11.119 f = open("testFI", "rb")
11.120 -r = iixr.FieldIndexReader(f)
11.121 +r = FieldIndexReader(f)
11.122 for docnum, offset in indexed_docs:
11.123 dn, o = r.read_document()
11.124 print docnum == dn, docnum, dn
11.125 @@ -182,19 +186,19 @@
11.126 # Test field dictionaries.
11.127
11.128 f = open("testF", "wb")
11.129 -w = iixr.FieldWriter(f)
11.130 +w = FieldWriter(f)
11.131 f2 = open("testFI", "wb")
11.132 -w2 = iixr.FieldIndexWriter(f2)
11.133 -wd = iixr.FieldDictionaryWriter(w, w2, 3)
11.134 +w2 = FieldIndexWriter(f2)
11.135 +wd = FieldDictionaryWriter(w, w2, 3)
11.136 for docnum, fields in doc_fields:
11.137 wd.write_fields(docnum, list(enumerate(fields)))
11.138 wd.close()
11.139
11.140 f = open("testF", "rb")
11.141 -r = iixr.FieldReader(f)
11.142 +r = FieldReader(f)
11.143 f2 = open("testFI", "rb")
11.144 -r2 = iixr.FieldIndexReader(f2)
11.145 -rd = iixr.FieldDictionaryReader(r, r2)
11.146 +r2 = FieldIndexReader(f2)
11.147 +rd = FieldDictionaryReader(r, r2)
11.148 doc_fields_reversed = doc_fields[:]
11.149 doc_fields_reversed.reverse()
11.150 for docnum, fields in doc_fields_reversed:
11.151 @@ -226,13 +230,13 @@
11.152 ]
11.153
11.154 f = open("test", "wb")
11.155 -w = iixr.TermWriter(f)
11.156 +w = TermWriter(f)
11.157 for term, offset, frequency, doc_frequency in terms:
11.158 w.write_term(term, offset, frequency, doc_frequency)
11.159 w.close()
11.160
11.161 f = open("test", "rb")
11.162 -r = iixr.TermReader(f)
11.163 +r = TermReader(f)
11.164 for term, offset, frequency, doc_frequency in terms:
11.165 t, o, fr, df = r.read_term()
11.166 print term == t, term, t
11.167 @@ -254,13 +258,13 @@
11.168 ]
11.169
11.170 f = open("test", "wb")
11.171 -w = iixr.TermIndexWriter(f)
11.172 +w = TermIndexWriter(f)
11.173 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
11.174 w.write_term(term, offset, frequency, doc_frequency, info_offset)
11.175 w.close()
11.176
11.177 f = open("test", "rb")
11.178 -r = iixr.TermIndexReader(f)
11.179 +r = TermIndexReader(f)
11.180 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
11.181 t, o, fr, df, i = r.read_term()
11.182 print term == t, term, t
11.183 @@ -273,27 +277,27 @@
11.184 # Test dictionaries with only term data.
11.185
11.186 f = open("test", "wb")
11.187 -w = iixr.TermWriter(f)
11.188 +w = TermWriter(f)
11.189 f2 = open("testI", "wb")
11.190 -w2 = iixr.TermIndexWriter(f2)
11.191 +w2 = TermIndexWriter(f2)
11.192 f3 = open("testP", "wb")
11.193 -w3 = iixr.PositionWriter(f3)
11.194 +w3 = PositionWriter(f3)
11.195 f4 = open("testPI", "wb")
11.196 -w4 = iixr.PositionIndexWriter(f4)
11.197 -wp = iixr.PositionDictionaryWriter(w3, w4, 2)
11.198 -wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
11.199 +w4 = PositionIndexWriter(f4)
11.200 +wp = PositionDictionaryWriter(w3, w4, 2)
11.201 +wd = TermDictionaryWriter(w, w2, wp, 3)
11.202 for term, offset, frequency, doc_frequency in terms:
11.203 wd._write_term(term, offset, frequency, doc_frequency)
11.204 wd.close()
11.205
11.206 f = open("test", "rb")
11.207 -r = iixr.TermReader(f)
11.208 +r = TermReader(f)
11.209 f2 = open("testI", "rb")
11.210 -r2 = iixr.TermIndexReader(f2)
11.211 -r3 = iixr.PositionOpener("testP")
11.212 -r4 = iixr.PositionIndexOpener("testPI")
11.213 -rp = iixr.PositionDictionaryReader(r3, r4)
11.214 -rd = iixr.TermDictionaryReader(r, r2, rp)
11.215 +r2 = TermIndexReader(f2)
11.216 +r3 = PositionOpener("testP")
11.217 +r4 = PositionIndexOpener("testPI")
11.218 +rp = PositionDictionaryReader(r3, r4)
11.219 +rd = TermDictionaryReader(r, r2, rp)
11.220 terms_reversed = terms[:]
11.221 terms_reversed.reverse()
11.222 for term, offset, frequency, doc_frequency in terms_reversed:
11.223 @@ -335,27 +339,27 @@
11.224 ]
11.225
11.226 f = open("test", "wb")
11.227 -w = iixr.TermWriter(f)
11.228 +w = TermWriter(f)
11.229 f2 = open("testI", "wb")
11.230 -w2 = iixr.TermIndexWriter(f2)
11.231 +w2 = TermIndexWriter(f2)
11.232 f3 = open("testP", "wb")
11.233 -w3 = iixr.PositionWriter(f3)
11.234 +w3 = PositionWriter(f3)
11.235 f4 = open("testPI", "wb")
11.236 -w4 = iixr.PositionIndexWriter(f4)
11.237 -wp = iixr.PositionDictionaryWriter(w3, w4, 2)
11.238 -wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
11.239 +w4 = PositionIndexWriter(f4)
11.240 +wp = PositionDictionaryWriter(w3, w4, 2)
11.241 +wd = TermDictionaryWriter(w, w2, wp, 3)
11.242 for term, doc_positions in terms_with_positions:
11.243 wd.write_term_positions(term, doc_positions)
11.244 wd.close()
11.245
11.246 f = open("test", "rb")
11.247 -r = iixr.TermReader(f)
11.248 +r = TermReader(f)
11.249 f2 = open("testI", "rb")
11.250 -r2 = iixr.TermIndexReader(f2)
11.251 -r3 = iixr.PositionOpener("testP")
11.252 -r4 = iixr.PositionIndexOpener("testPI")
11.253 -rp = iixr.PositionDictionaryReader(r3, r4)
11.254 -rd = iixr.TermDictionaryReader(r, r2, rp)
11.255 +r2 = TermIndexReader(f2)
11.256 +r3 = PositionOpener("testP")
11.257 +r4 = PositionIndexOpener("testPI")
11.258 +rp = PositionDictionaryReader(r3, r4)
11.259 +rd = TermDictionaryReader(r, r2, rp)
11.260 terms_reversed = terms_with_positions[:]
11.261 terms_reversed.reverse()
11.262 for term, doc_positions in terms_reversed:
11.263 @@ -407,10 +411,10 @@
11.264 ("shells", 37, None)
11.265 ]
11.266
11.267 -index = iixr.Index("test_index")
11.268 +index = Index("test_index")
11.269 wi = index.get_writer(3, 2, 6)
11.270 for docnum, text in docs:
11.271 - doc = iixr.Document(docnum)
11.272 + doc = Document(docnum)
11.273 for position, term in enumerate(text.split()):
11.274 doc.add_position(term, position)
11.275 doc.add_field(123, text)