# Listing 2: A basic spider and indexer

#!/usr/bin/perl -w
use strict;
use vars qw($dbh $statement $sth $url $content $code $status $title $text @links);
 
use LWP::RobotUA; # Spiders public sites politely
use LWP::UserAgent;      # Spiders selfishly (okay for your own servers)
use HTML::LinkExtor; # For extracting links
use URI::URL;            # For making relative URL's absolute
use DBI;          # For access to SQL database (here, MySQL)
 
# Main driver: take every URL in the spider table that has not been visited
# yet, fetch it, and index whatever comes back.

# Connect to the SQL server. PrintError and RaiseError are both off, so each
# DBI call below has to check its own return value.
    $dbh = DBI->connect('DBI:mysql:lisa', "lisa", "lisa", {PrintError=>0,RaiseError=>0})
           or die "Cannot connect to database: $DBI::errstr";
 
# Get the items to spider from the spider table (status IS NULL)
    $statement = "SELECT url FROM spider WHERE status IS NULL";
    $sth = $dbh->prepare($statement) or die "$DBI::errstr";
    $sth->execute() or die "$DBI::errstr";
 
# While we have items...
    while ( ($url) = $sth->fetchrow_array ){
 
           # Clear the previous page's data. Every variable is assigned
           # explicitly: a list assignment from a single "" would only
           # initialise the first one.
           $title = $content = $text = "";
           @links = ();
           print "Spidering: $url\n";
 
           # Set the URL status from null to zero in spider table, so the
           # row no longer matches the "status IS NULL" selection. The URL
           # is quoted because it comes from crawled pages.
           my $url_q = $dbh->quote($url);
           $dbh->do(" UPDATE spider SET status=0 WHERE url=$url_q ") or die "$DBI::errstr";
 
           # Fetch the item, extract what we need
           ($status,$title,$text,@links) = spider($url);
 
           # $status is the first value returned by spider(); it is false
           # when spider() gave up before fetching the page.
           if ($status){
                  # Store all the info in the database
                  update_db($url,$title,$text,@links);
           }
    }
 
# Finish
$sth->finish;
$dbh->disconnect;
 
#-----------------------------------------------------------
 
# spider($url)
#
# Fetches $url and extracts what the indexer needs from it.
#
# Returns the list ($code, $title, $text, @links):
#   $code   - HTTP status code of the GET request
#   $title  - page title as reported by the response (200 only)
#   $text   - page body reduced to plain words by strip() (200 only)
#   @links  - absolute URLs of the links found in <a> tags (200 only)
# For any other status code, $title and $text are empty strings and @links
# is empty. Returns nothing at all (so the caller's status is undef) when the
# HEAD request does not report a text/html document.
#
# Side effects: writes the HTTP status code to the spider table through the
# global $dbh, and overwrites the globals $title, $content, $text and @links.
sub spider {
 
    my $url = shift;
 
    # Start from a clean slate so that a failed fetch can never hand back
    # the previous page's data through the shared globals.
    ($title, $content, $text) = ('', '', '');
    @links = ();
 
    # Use this if spidering your own servers (ignores robots.txt)
    my $ua = LWP::UserAgent->new; 
 
    # Otherwise use this (slower but respects robots.txt)
    # my $ua = LWP::RobotUA->new('LISA 1.0','youremail@address');
 
    # Request headers only, to see the content-type
    my $request = HTTP::Request->new(HEAD => $url);
    my $result = $ua->request($request);
 
    # Limit ourselves to files of type 'text/html'. content_type() yields
    # the bare media type, so "text/html; charset=utf-8" is accepted too,
    # and a missing header is simply treated as "not HTML".
    my $content_type = $result->content_type;
    return unless defined $content_type && $content_type eq "text/html";
 
    # Fetch the entire request (not just headers)
    $request = HTTP::Request->new(GET => $url);
    $result = $ua->request($request);
 
    # Update status in the spider table, using HTTP status code
    my $code = $result->code;
    my $url_q = $dbh->quote($url);
    $dbh->do("UPDATE spider SET status=$code WHERE url=$url_q ") or print "$DBI::errstr";    
 
    if ($code == 200){
           # Get title, body
           $title = $result->title;
           $content = $result->content;
           $text = strip($content);
           
           # Collect the link attributes of every <a> tag. An anonymous
           # closure is used rather than a named sub nested in this one.
           my @found;
           my $p = HTML::LinkExtor->new(sub {
                  my($tag, %attr) = @_;
                  return if $tag ne 'a';
                  push(@found, values %attr);
           });
           $p->parse($content);
           $p->eof;   # flush anything the parser is still buffering
 
           # Expand all URLs to absolute ones
           my $base = $result->base;
 
           # Fix up links (remove trailing #fragment; make URL's absolute)
           @links = map {
                  (my $target = $_) =~ s/#.*//;
                  url($target, $base)->abs;
           } @found;
    }
    return ($code,$title,$text,@links);
}
 
# update_db($url, $title, $text, @links)
#
# Stores one spidered page in the database through the global $dbh:
#   - one row in doc (url, title),
#   - one row in word per word of $text, with its position in the text,
#   - one row in spider per link that passes the link filter below.
#
# Does nothing when $text is undefined or not longer than 10 characters.
# Dies if the doc row cannot be inserted; a failed word insert is only
# reported and the remaining words are still processed.
sub update_db {
 
    # Lexical copies: the arguments must not overwrite the package globals
    # of the same names that the main loop is still using.
    my ($url, $title, $text, @links) = @_;
    return unless defined $text && length($text) > 10;
    
    # Quote the values before inserting
    my $url_q = $dbh->quote($url);
    my $title_q = $dbh->quote($title);
 
    # Store URL and title in the doc table
    $dbh->do("INSERT INTO doc (url,title) VALUES ($url_q,$title_q)") or die "$DBI::errstr";
 
    # Get the auto_generated id
    my $doc_id = $dbh->{'mysql_insertid'};
 
    # For each word, create an entry in the word table. split ' ' skips
    # leading and repeated whitespace, so no empty "words" are stored.
    my $counter = 0;
    foreach my $word (split ' ', $text){
           # Quote before inserting
           my $word_q = $dbh->quote($word);
           $dbh->do("INSERT INTO word (id,word,position) VALUES ($doc_id,$word_q,$counter)") or print "$DBI::errstr";
           $counter++;
    }
 
    # Insert links into the spider table
    foreach my $link (@links){
           # Add some criteria below -- e.g.:
           # NOTE: the dots in this pattern are unescaped, so they match
           # any character; tighten it when adapting to a real site.
           if ($link =~ /http.*csf.edu/){
                  # Quote before inserting
                  my $link_q = $dbh->quote($link);
                  # The result is not checked -- presumably so that a URL
                  # already present in the table is silently skipped; verify
                  # against the spider table's key definition.
                  $dbh->do("INSERT INTO spider (url) VALUES ($link_q)");
           }else{
                  print "Ignoring link to $link\n";
           }
    }
    return;
}
 
# strip($html)
#
# Reduces an HTML document to a plain string of words made of ASCII letters
# and digits, separated by single spaces, with no leading or trailing space.
# Everything up to and including the BODY tag is discarded when one exists.
#
# Returns the cleaned text; returns '' for undefined or empty input.
sub strip {
 
    my ($html) = @_;
    return '' unless defined $html;
 
    $html =~ s/[\r\n]/ /g;       # Kill linefeeds, etc.
    $html =~ s/.*<BODY.*?>//i;   # Kill up to BODY tag
 
    # Replace each tag with a space rather than nothing, so that words
    # separated only by markup ("<p>Hello</p><p>World</p>") stay separate.
    $html =~ s/<.*?>/ /g;
 
    # Strip out HTML entities like &nbsp; or &#169;. Only a real entity is
    # matched, so a bare "&" cannot swallow text up to some later ";".
    $html =~ s/&#?\w+;/ /g;
 
    $html =~ s/[^a-zA-Z0-9]/ /g; # Kill punctuation, etc.
    $html =~ s/^\s+//;           # Kill leading spaces
    $html =~ s/\s+/ /g;          # Kill repeating spaces
    $html =~ s/\s+$//;           # Kill trailing space
    return $html;
}

