# # Isearch search algorithm implemented in Perl # # Brent Baccala # June 2001 # # Isearch (from www.etymon.com) is a full-text search engine. This # module implements the core features of searching an Isearch index. # # What's not supported: # - field searches don't work # - no date/numeric/bounding box searches # - TITLEs are not retrieved from HTML documents # - no multi-term or boolean searches # # $Log: Isearch.pm,v $ # Revision 1.16 2002/05/11 18:39:55 baccala # Fixed handling of :3 type weights at end of search terms # # Revision 1.15 2002/05/11 18:29:52 baccala # Fixed handling of asterisk in search terms # # Revision 1.14 2002/05/05 21:11:17 baccala # Added ability to set debug level via an environment var # # Revision 1.13 2002/05/05 20:58:05 baccala # Added support for big endian architectures (like the sparc I'm now running # it on). Checks DBI file for architecture type database was written on, # and adjusts it's pack templates accordingly. # # Revision 1.12 2002/05/05 20:27:26 baccala # Switched to using Freesoft's scoring mechanism # # Revision 1.11 2002/01/24 18:46:26 baccala # added openMDT function # fixed typo exists/defined # # Revision 1.10 2001/08/04 03:24:24 baccala # Renamed package from Isearch2 to Isearch # # Revision 1.9 2001/07/04 19:43:45 baccala # Added stop word list from Isearch # # Revision 1.8 2001/07/04 19:00:39 baccala # Corrected slight problem with limit finding that could, in a rare case, # drop a single valid entry at the end of the range # # Revision 1.7 2001/07/02 19:43:56 baccala # Revised version works more like C Isearch - binary searches MDT on disk, # rather than reading the whole thing into memory. # # Revision 1.6 2001/07/02 17:02:37 baccala # &fetchBuffer now returns undef if it can't open a file instead of killing # the whole script # # Revision 1.5 2001/06/21 06:36:32 baccala # Dropped use of Tree::RedBlack; now read in raw MDT # Fixed a slight bug in the binary search algorithm # # Revision 1.4 2001/06/21 05:11:20 baccala # Added recursive decent parser (working, but not yet complete) to # parse boolean search expressions # # Revision 1.3 2001/06/07 03:47:26 baccala # Handles deleted documents, and that means it handles changed documents (yea!) # package Isearch; use strict; use Data::Dumper; my $debug = 0; if (exists $ENV{ISEARCH_DEBUG}) { $debug = $ENV{ISEARCH_DEBUG}; } # Isearch indexes documents using a single global index (32 bits), # which maps to character positions in the files. To resolve a global # index, you look it up in the Master Document Table (MDT) which lists # initial and final global indices for each file in the collection, # find the right file, then subtract the file's initial index to get a # character offset in the file. Ex: index.htm is 400 bytes long and # maps to global indices 100001-100400. So global index 100200 # corresponds to the 200th byte in index.htm. The main index is just # a sorted list of global indices for every word in the collection; # you binary search it. # &lookupGP - Find a global position (GP) in the MDT and returns its # MDT entry. Assumes MDT is already open on filehandle MDT, and # number of records is in $mdtsize. Assumes we're using an index # built with Isearch v1.47, which writes 1380-byte MDT records using a # C structure. my $mdtsize; my %DBI; sub openMDT { my ($db) = @_; my $key; print "reading DBI for $db\n" if $debug>0; open(DBI,"<$db.dbi") or die "opening $db.dbi"; while () { if (/^ \+(.*)/) { $key = $1; } if (/^ \+(.*)/) { $DBI{$key} = $1; print "read DBI $key -> $1\n" if $debug>1; } } close(DBI); print "reading MDT for $db\n" if $debug>0; open(MDT, "<$db.mdt") or die "opening $db.mdt"; binmode MDT; my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = stat(MDT) or die "stat MDT"; die "Invalid sized MDT for $db\n" if ($size%1380 != 0); $mdtsize = $size/1380; } sub lookupGP { my ($gp) = @_; my $pack_template; if ($DBI{BigEndian}) { $pack_template = 'Z16Z64Z255Z1024xNNNNCxxx'; } else { $pack_template = 'Z16Z64Z255Z1024xVVVVCxxx'; } my $left = 0; my $right = $mdtsize - 1; while ($left <= $right) { my $mid = int(($left + $right)/2); my $mdtentry; seek(MDT, 1380 * $mid, 0); read(MDT, $mdtentry, 1380); my ($dockey, $doctype, $pathname, $filename, $gfstart, $gfend, $lrstart, $lrend, $deleted) = unpack($pack_template, $mdtentry); if ($gp >= $gfstart and $gp <= $gfend) { my $value = { DocKey => $dockey, DocType => $doctype, Pathname => $pathname, Filename => $filename, GFstart => $gfstart, GFend => $gfend, LRstart => $lrstart, LRend => $lrend, Deleted => $deleted }; return $value; } elsif ($gp < $gfstart) { $right = $mid-1; } else { $left = $mid+1; } } return undef; } my $readlen = 32; my $exact = 0; # &fetchBuffer($db, $inx) retrieves one word from the document # collection, from the $inx'th position in the index file, and returns # it. Can return undef if the document was marked deleted. The index # file is assumed to be opened on file handle INX. Since the index # file is just a sorted list of 32-bit global indices, we seek to # 4*$inx in the index file, read the global index we find there, look # it up in the MDT, open the file pointed to by the MDT entry, seek # into that file, and read up to $readlen bytes from the indicated # position. Unless we're doing an exact match, we then throw away # everything after the first non-word-constituent character and return # a lowercased version of the word. sub fetchBuffer { my ($db, $inx) = @_; my $buffer; my $gpos; seek(INX, 4*$inx, 0); read(INX, $buffer, 4); if ($DBI{BigEndian}) { $gpos = unpack('N', $buffer); } else { $gpos = unpack('V', $buffer); } my $mdtentry = &lookupGP($gpos); die "mdtentry undef" unless defined $mdtentry; return undef if $$mdtentry{Deleted}; open(FILE, $$mdtentry{Pathname} . $$mdtentry{Filename}) or return undef; binmode FILE; seek(FILE, $gpos - $$mdtentry{GFstart}, 0); read(FILE, $buffer, $readlen); close FILE; if ($exact) { $buffer =~ s/[\W_]$//; $buffer =~ s/[\W_]/ /g; } else { $buffer =~ s/[\W_].*//s; } #print "$inx($gpos) = $buffer\n"; return lc $buffer; } # &fetchBufferNearestMid - retrieves one word from the document collection # near the midpoint of index positions ($left, $right). Uses &fetchBuffer, # and takes into consideration that the actual midpoint might correspond # to a deleted document. Constructs a range around the midpoint containing # deleted entries and bracketed on each side by either a legitimate hit # or the limit of the original range. Returns a four element list - # left word, left index, right word, right index. Each word will be # undef if everything on that side up to the initial endpoint was deleted. sub fetchBufferNearestMid { my ($db, $left, $right) = @_; my $mid = int(($right + $left)/2); my $buffer = &fetchBuffer($db, $mid); if (defined $buffer) { print "fetchBufferNearestMid returns ($buffer, $mid)\n" if $debug>1; return ($buffer, $mid, $buffer, $mid); } my ($bufferl, $midl, $bufferr, $midr); for ($midl = $mid-1; $midl >= $left; $midl --) { $bufferl = &fetchBuffer($db, $midl); last if defined $bufferl; } for ($midr = $mid+1; $midr <= $right; $midr ++) { $bufferr = &fetchBuffer($db, $midr); last if defined $bufferr; } print "fetchBufferNearestMid returns ($bufferl, $midl, $bufferr, $midr)\n" if $debug>1; return ($bufferl, $midl, $bufferr, $midr); } # &binarySearch - uses &fetchBufferNearestMid (so index file must be # open on INX) to binary search an index file. If a hit was found, # immediately returns a reference to a three-item list: [$left, $hit, # $right], where $hit is the index of the hit, while $left and $right # bracket it, since there might be more than one matching index. # $left is always less than the lowest matching index, while $right is # always greater than the highest matching index. sub binarySearch { my ($db, $searchWord, $left, $right) = @_; while ($left <= $right) { print "binarySearch($searchWord) [$left, $right]\n" if $debug>1; my ($bufferl, $midl, $bufferr, $midr) = &fetchBufferNearestMid($db, $left, $right); if (defined $bufferl and $bufferl eq $searchWord) { return [$left, $midl, $right]; } elsif (defined $bufferr and $bufferr eq $searchWord) { return [$left, $midr, $right]; } elsif (defined $bufferr and $bufferr lt $searchWord) { $left = $midr+1; } elsif (defined $bufferl and $bufferl gt $searchWord) { $right = $midl-1; } else { # by this point... # if defined $bufferl, $bufferl lt $searchWord # if defined $bufferr, $bufferr gt $searchWord # ...so $searchWord must be in the range of deleted entries return undef; } } return undef; } # &limitSearchLeft - called after &binarySearch to find the left # boundary of the range of matching indices. Returns the lowest index # that matches $searchWord. None of the indices in the passed-in # range should be gt $searchWord, so we're just looking for the # boundary between "lt" (on the left) and "eq" (on the right). Uses # &fetchBuffer, so index file must still be open on INX. sub limitSearchLeft { my ($db, $searchWord, $left, $right) = @_; while ($left+1 < $right) { print "limitSearchLeft($searchWord) [$left, $right]\n" if $debug>1; my ($bufferl, $midl, $bufferr, $midr) = &fetchBufferNearestMid($db, $left, $right); if (defined $bufferl and $bufferl eq $searchWord) { $right = $midl; } elsif (defined $bufferr and $bufferr lt $searchWord) { $left = $midr+1; } else { # by this point... # if defined $bufferl, $bufferl lt $searchWord # if defined $bufferr, $bufferr eq $searchWord # ...so the boundary must be in the range of deleted entries, # and the returned right midpoint is the lowest matching index return $midr if defined $bufferr; # The passed-in right boundary should have been a legitimate # hit returned by &binarySearch, so... die "\$bufferr should never be undef in limitSearchLeft"; } } my ($bufferl, $midl, $bufferr, $midr) = &fetchBufferNearestMid($db, $left, $left); return $left if (defined $bufferl and $bufferl eq $searchWord); return $right; } # &limitSearchRight - same deal for finding the right limit of # the matching indices sub limitSearchRight { my ($db, $searchWord, $left, $right) = @_; while ($left+1 < $right) { print "limitSearchRight($searchWord) [$left, $right]\n" if $debug>1; my ($bufferl, $midl, $bufferr, $midr) = &fetchBufferNearestMid($db, $left, $right); if (defined $bufferl and $bufferl gt $searchWord) { $right = $midl-1; } elsif (defined $bufferr and $bufferr eq $searchWord) { $left = $midr; } else { return $midl if defined $bufferl; die "\$bufferl should never be undef in limitSearchRight"; } } my ($bufferl, $midl, $bufferr, $midr) = &fetchBufferNearestMid($db, $right, $right); return $right if (defined $bufferr and $bufferr eq $searchWord); return $left; } # &searchTermINX - searches for a particular term in a particular index # and returns a list of matching global positions (possibly the empty list). # # Opens the index (passed in), &binarySearch's for the term, then uses # &limitSearch's to find the left and right boundaries. Reads from the # index file everything between the boundaries (these will be the # global positions that matched), unpacks this into a list and returns it. sub searchTermINX { my ($db, $index, $searchWord) = @_; open(INX, $index) or die "opening $index"; binmode INX; my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = stat(INX) or die "stat INX"; my $result = &binarySearch($db, $searchWord, 0, int($size/4)); if (not defined $result) { close INX; return (); } my $left = &limitSearchLeft($db, $searchWord, $$result[0], $$result[1]); my $right = &limitSearchRight($db, $searchWord, $$result[1], $$result[2]); print "$index: [$left, $right]\n" if $debug>0; my $buffer; seek(INX, 4*$left, 0); read(INX, $buffer, 4*($right-$left+1)) == 4*($right-$left+1) or die "short read on INX"; close (INX); if ($DBI{BigEndian}) { return unpack('N*', $buffer); } else { return unpack('V*', $buffer); } } # &searchTermPrim - searches for a word in a database and returns a list # of matching global positions. If the database hasn't been optimized, # then there might be multiple index files. Calls &searchTermINX for # every index file and concatenates all the results. Note that the # result set may contains global positions pointing to deleted files; # the MDT still needs to be checked for each result to weed out deletions. sub searchTermPrim { my ($db, $searchWord) = @_; if ($searchWord =~ s/\*$//) { # Wildcard - throw away * (the s did) and only match first n letters $readlen = length($searchWord); } else { # No wildcard - match n + 1 letters to make sure the word ends $readlen = length($searchWord) + 1; } $searchWord = lc $searchWord; $searchWord =~ s/[\W_]/ /g; $exact = 1; if (not -r "$db.inx.1") { return &searchTermINX($db, "$db.inx", $searchWord); } else { return map { &searchTermINX($db, $_, $searchWord) } (glob "$db.inx.*"); } } # &searchTerm - searches for a word in a database and returns a hash # reference. The hash keys are filenames and the values are hit # counts, normalized by dividing through by the square root of the sum # of the squares. sub searchTerm { my ($db, $searchWord) = @_; my @results; my %SCORES; my $mdtentry; my $weight; if ($searchWord =~ s/:([0-9]+)$//) { $weight = $1; } else { $weight = 1; } @results = &searchTermPrim($db, $searchWord); foreach my $gp (sort {$a <=> $b} @results) { if ((not defined $mdtentry) or $gp > $$mdtentry{GFend}) { $mdtentry = &lookupGP($gp); } next if $$mdtentry{Deleted}; # Standard Isearch scoring mechanism - scores all hits the same #$SCORES{$$mdtentry{Pathname}.$$mdtentry{Filename}} ++; # Freesoft's experimental scoring mechanism - score hits inversely # proportional to the size of the file $SCORES{$$mdtentry{Pathname}.$$mdtentry{Filename}} += (1 / $$mdtentry{LRend}); } my $sumsq = 0; foreach my $file (keys %SCORES) { $sumsq += $SCORES{$file} * $SCORES{$file}; } my $sqrtsum = sqrt($sumsq); foreach my $file (keys %SCORES) { $SCORES{$file} /= $sqrtsum; $SCORES{$file} *= $weight; } return \%SCORES; } sub searchExpr { my ($db, @searchPhrase) = @_; my $result; ($result, @searchPhrase) = &searchExpr2($db, @searchPhrase); while (@searchPhrase > 0 and ($searchPhrase[0] eq "or" or $searchPhrase[0] eq "||")) { my $result2; shift @searchPhrase; ($result2, @searchPhrase) = &searchExpr2($db, @searchPhrase); foreach my $file (keys %$result2) { $$result{$file} += $$result2{$file}; } } return $result, @searchPhrase; } sub searchExpr2 { my ($db, @searchPhrase) = @_; my $result; ($result, @searchPhrase) = &searchExpr3($db, @searchPhrase); while (@searchPhrase > 0 and ($searchPhrase[0] eq "and" or $searchPhrase[0] eq "&&")) { my $result2; my $newResult = {}; shift @searchPhrase; ($result2, @searchPhrase) = &searchExpr3($db, @searchPhrase); foreach my $file (keys %$result) { if (exists $$result2{$file}) { $$newResult{$file} = $$result{$file} + $$result2{$file}; } } $result = $newResult; } return $result, @searchPhrase; } sub searchExpr3 { return &searchExpr4(@_); } #sub searchExpr3 { # my ($db, @searchPhrase) = @_; # # my ($result, @restPhrase) = &searchExpr4($db, @searchPhrase); # # return $result if $#restPhrase == -1; # # while ($restPhrase[0] eq "andnot" or $restPhrase[0] eq "&!") { # my $rightResult = &searchExpr4($db, shift @restPhrase); # # ANDNOT merge $rightResult into $result # } #} sub searchExpr4 { return &searchExpr5(@_); } #sub searchExpr4 { # my ($db, @searchPhrase) = @_; # # my ($result, @restPhrase) = &searchExpr5($db, @searchPhrase); # # return $result if $#restPhrase == -1; # # while ($restPhrase[0] eq "near") # my $rightResult = &searchExpr5($db, shift @restPhrase); # # NEAR merge $rightResult into $result # } #} sub searchExpr5 { my ($db, @searchPhrase) = @_; my $result; if ($searchPhrase[0] eq "(") { shift @searchPhrase; ($result, @searchPhrase) = &searchExpr($db, @searchPhrase); if (shift @searchPhrase ne ")") { die "Unterminated parenthesis in search request"; } } else { $result = &searchTerm($db, shift @searchPhrase); while (defined $searchPhrase[0] and $searchPhrase[0] ne "(" and $searchPhrase[0] ne "and" and $searchPhrase[0] ne "or" and $searchPhrase[0] ne "andnot" and $searchPhrase[0] ne "near") { my $result2 = &searchTerm($db, shift @searchPhrase); my $newResult = {}; foreach my $file (keys %$result) { if (exists $$result2{$file}) { $$newResult{$file} = $$result{$file} + $$result2{$file}; } } $result = $newResult; } } return $result, @searchPhrase; } # stop word list taken directly from Isearch (sw.hxx) my @stopwords = ( "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "about", "above", "according", "across", "actually", "adj", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "aren't", "around", "as", "at", "b", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "behind", "being", "below", "beside", "besides", "between", "beyond", "billion", "both", "but", "by", "c", "can", "can't", "cannot", "caption", "co", "co.", "could", "couldn't", "d", "did", "didn't", "do", "does", "doesn't", "don't", "down", "during", "e", "each", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "f", "few", "fifty", "first", "five", "for", "former", "formerly", "forty", "found", "", "four", "from", "further", "g", "h", "had", "has", "hasn't", "have", "haven't", "he", "he'd", "he'll", "he's", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "in", "inc.", "indeed", "instead", "into", "is", "isn't", "it", "it's", "its", "itself", "j", "k", "l", "last", "later", "latter", "latterly", "least", "less", "let", "let's", "like", "likely", "ltd", "m", "made", "make", "makes", "many", "maybe", "me", "meantime", "meanwhile", "might", "million", "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my", "myself", "n", "namely", "neither", "never", "nevertheless", "next", "nine", "ninety", "no", "nobody", "none", "nonetheless", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "often", "on", "once", "one", "one's", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "overall", "own", "p", "per", "perhaps", "q", "r", "rather", "recent", "recently", "s", "same", "seem", "seemed", "seeming", "seems", "seven", "seventy", "several", "she", "she'd", "she'll", "she's", "should", "shouldn't", "since", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "stop", "such", "t", "taking", "ten", "than", "that", "that'll", "that's", "that've", "the", "their", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "thirty", "this", "those", "though", "thousand", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards", "trillion", "twenty", "two", "u", "under", "unless", "unlike", "unlikely", "until", "up", "upon", "us", "used", "using", "v", "very", "via", "w", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "well", "were", "weren't", "what", "what'll", "what's", "what've", "whatever", "when", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "who'd", "who'll", "who's", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "with", "within", "without", "won't", "would", "wouldn't", "x", "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "z" ); my @specials = ("and", "or", "andnot", "near"); # (([^\s()&\|"{]|&(?![&!])|\|(?!\|)|"[^"]*"|\{[^}]*\})+|$|$|&&|&!|\|\|) sub search { my ($db, $searchWord) = @_; my $searchPhrase = $searchWord; my @searchPhrase; while ($searchPhrase) { if ($searchPhrase =~ s!^\s+!!) { # Throw away whitespace } elsif ($searchPhrase =~ s!^"([^"]*)"!!) { my $word = lc $1; push @searchPhrase, $word unless grep {$_ eq $word} @stopwords; } elsif ($searchPhrase =~ s!^([[:alnum:]]+\*?(:[0-9]+)?)!!) { my $word = lc $1; if (grep {$_ eq $word} @specials) { push @searchPhrase, $word; } else { push @searchPhrase, $word unless grep {$_ eq $word} @stopwords; } } else { $searchPhrase =~ s!^(.)!!; push @searchPhrase, $1; } } print Dumper(\@searchPhrase) if $debug>0; &openMDT($db); my $result; ($result, @searchPhrase) = &searchExpr($db, @searchPhrase); die "unterminated search expression" if (@searchPhrase > 0); close(MDT); print "search done\n" if $debug>0; return $result; } 1;