Full-Text Searching in Perl
by Tim Kientzle


Example 1: 
#!/usr/local/bin/perl
require 5;
use DB_File;
use Fcntl;
tie(%index,DB_File,'index.db',
    O_RDONLY, 0, $DB_File::DB_BTREE);
foreach $word (@ARGV) { # Find each word
  $keys = $worddb{lc $word};
  foreach $key (unpack("n*",$keys)) {
    $matches{$key}++;
  }
}
@matches = sort # Rank by total matches
  { $matches{$b} <=> $matches{$a} 
     || $a <=> $b }
  (keys %matches);
foreach $key (@matches) { # Print names
  $name = $index{pack("xn",$key)};
  print "$matches{$key}: $name\n";
}
untie(%index); # Release database



Example 2: 

$DB_File::DB_BTREE->{cachesize} = 10_000_000; # 10meg cache
$DB_File::DB_BTREE->{psize} = 32*1024; # 32k pages


Table 1: 

Total size of corpus: 199 MB
Number of HTML files: 2545
Total size of HTML files: 64 MB

Total size of index.db database: 8 MB
Time to search for "perl search engine": 0.2 seconds
      (4 files matched all three terms, 934 matched at least one)
Time to build index using Listing One: over 3 hours!
      (But, only 14 minutes of CPU time!)
Time to build index using optimized indexer: 20 minutes
      (15 minutes of CPU time)


Figure 1: 

[[Note: Author has a graphical version of this.]]

Page 1:  ..... navy <page 98> rosebud <page 406> victory ....

'perl' is between 'navy' and 'rosebud'

Page 98:   .... parsimonious <page 237> perilous <page 73> persimmon ...
'perl' is between 'perilous' and 'persimmon'

Page 73:      .... perl <data> ....


Listing One
#!/usr/local/bin/perl
require 5;
use DB_File;    # Access DB databases
use Fcntl;      # Needed for above...
use File::Find; # Directory searching
undef $/; # Don't obey line boundaries
$currentKey = 0;

# Single database version:
#    Stores file entries in index.db as <NULL><ASCII file number>
#    The leading NULL prevents any word entries from colliding.
############################################################################
unlink("index.db");         # Delete old index.db
tie(%index,'DB_File',"index.db",    # Open new one
    O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE) ;
find(\&IndexFile,"articles");       # Index all of the files
untie(%index);                      # Close database
###########################################################################
sub IndexFile {
    if(!-f) { return; }
    if(/\.html?$/) {                # Handle HTML files
    print "$File::Find::name\n";
    open(HTML_FILE,$_);
    my($text) = <HTML_FILE>;        # Read entire file
    $text =~ s/<[^>]*>//g;          # Strip out all HTML tags
    my($wordsIndexed) = &IndexWords($text,$currentKey);
    $filedb{pack"xn",$currentKey} = $File::Find::name;
    $currentKey++;
    }
}
###########################################################################
sub IndexWords {
    my($words, $fileKey) = @_;
    my(%worduniq);          # for unique-ifying word list
    # Split text into Array of words
    my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
    @words = grep { $worduniq{$_}++ == 0 }         # Remove duplicates
         grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ }  # Strip leading punct
             grep { length > 1 }                   # Must be > 1 char long
             grep { /[a-zA-Z0-9\xc0-\xff]/ }       # must have alphanumeric
         grep { !/[\x00-\x01f]/ }                  # No ctrl chars
             @words;
    foreach (sort @words) { # Add file key to each word
    my($a) = $index{$_};
    $a .= pack "n",$fileKey;
        $index{$_} = $a;
    }
    return scalar(@words);    # Return count of words indexed
}

Listing Two
# Flush temporary in-memory %wordCache to disk database %index
sub FlushWordCache {
    my($word,$entry);
    # Do merge in sorted order to improve cache response of on-disk DB
    foreach $word (sort keys %wordCache) {
    $entry = $wordCache{$word};
    if(defined $index{$word}) {
        my($codedList);
        $codedList = $index{$word};
        $entry = &MergeLists($codedList,$entry);
    }
    # Store merged list into database
    $index{$word} = $entry;
    }
    %wordCache = (); # Empty the holding queue
}
###########################################################################
sub MergeLists {
    my($list);
    foreach (@_) { $list .= $_; }              # append all the lists
    my(@unpackedList) = unpack("n*",$list);    # Unpack into integers
    my(%uniq); # sort and unique-ify
    @unpackedList = grep { $uniq{$_}++ == 0 }  # Remove duplicates
                    sort { $a <=> $b }         # Sort
                    @unpackedList;
    return pack("n*",@unpackedList);           # repack
}


1


