adlerl1.txt 0100755 0000000 0000000 00000000256 07446130704 011655 0 ustar root root image=/boot/vmlinuz-2.4.2-2 label=linux initrd=/boot/initrd-2.4.2-2.img read-only root=/dev/hda6 append="ramdisk_size=102400" adlerl2.txt 0100755 0000000 0000000 00000000316 07446131116 011651 0 ustar root root raiddev /dev/md0 raid-level 1 chunk-size 32 persistent-superblock 1 nr-spare-disks 0 nr-raid-disks 2 device /dev/sda1 raid-disk 0 device /dev/ram raid-disk 1 bondl1.txt 0100755 0000000 0000000 00000001133 07445732414 011510 0 ustar root root Listing 1: Create the database CREATE DATABASE lisa; USE lisa; CREATE TABLE doc ( id INT PRIMARY KEY NOT NULL AUTO_INCREMENT, url CHAR(255), title CHAR(255), UNIQUE (url) ); ALTER TABLE doc ADD INDEX (id); ALTER TABLE doc ADD INDEX (url); CREATE TABLE word ( id INT NOT NULL, word CHAR(255) NOT NULL, position INT ); ALTER TABLE word ADD INDEX (id); ALTER TABLE word ADD INDEX (word); ALTER TABLE word ADD INDEX (position); CREATE TABLE spider ( url CHAR(255), status CHAR(255) ); ALTER TABLE spider ADD INDEX (url); ALTER TABLE spider ADD INDEX (status); bondl2.txt 0100755 0000000 0000000 00000011667 07451125664 011526 0 ustar root root Listing 2: A basic spider and indexer #!/usr/bin/perl -w use strict; use vars qw($dbh $statement $sth $url $content $code $status $title $text @links); use LWP::RobotUA; # Spiders public sites politely use LWP::UserAgent; # Spiders selfishly (okay for your own servers) use HTML::LinkExtor; # For extracting links use URI::URL; # For making relative URL's absolute use DBI; # For access to SQL database (here, MySQL) # Connect to the SQL server $dbh = DBI->connect('DBI:mysql:lisa', "lisa", "lisa", {PrintError=>0,RaiseError=>0}) || die; # Get an item to spider from the spider table $statement = "SELECT url FROM spider WHERE status IS NULL"; $sth = $dbh->prepare($statement); $sth->execute(); # While we have items... 
# Main loop: walk every URL returned by the SELECT above, mark it as in
# progress, fetch it, and index whatever came back.
while ( ($url) = $sth->fetchrow_array ) {

    # Reset all three per-page globals. (Assigning a single "" to the
    # list would initialise only $title and leave the others undefined.)
    ($title, $content, $text) = ("", "", "");
    print "Spidering: $url\n";

    # Set the URL status from null to zero in spider table. The URL is
    # quoted through DBI, like every other statement in this script, so a
    # quote character inside a harvested link cannot break (or alter)
    # the SQL.
    my $url_q = $dbh->quote($url);
    $dbh->do("UPDATE spider SET status=0 WHERE url=$url_q")
        or die "$DBI::errstr";

    # Fetch the item, extract what we need
    ($status, $title, $text, @links) = spider($url);
    if ($status) {
        # Store all the info in the database
        update_db($url, $title, $text, @links);
    }
}

# Finish
$sth->finish;
$dbh->disconnect;

#-----------------------------------------------------------

# spider($url)
#
# Fetches one URL and extracts what the indexer needs.
#
# Returns the list ($code, $title, $text, @links), where $code is the HTTP
# status of the GET request, or an empty list when the HEAD request shows
# the resource is not text/html. Also records $code in the spider table
# and, as before, leaves its results in the globals $title, $content,
# $text and @links.
sub spider {
    my $url = shift;

    # Use this if spidering your own servers (ignores robots.txt)
    my $ua = LWP::UserAgent->new;

    # Otherwise use this (slower but respects robots.txt)
    # my $ua = LWP::RobotUA->new('LISA 1.0','youremail@address');

    # Start from a clean slate so a failed fetch can never hand back the
    # links or text of the previously spidered page.
    ($title, $content, $text) = ("", "", "");
    @links = ();

    # Request headers only, to see the content-type
    my $request = HTTP::Request->new(HEAD => $url);
    my $result  = $ua->request($request);

    # Limit ourselves to files of type 'text/html'. content_type() yields
    # the lower-cased media type without parameters, so a header such as
    # "text/html; charset=ISO-8859-1" still matches and a missing header
    # compares as the empty string instead of raising a warning.
    return unless $result->content_type eq 'text/html';

    # Fetch the entire request (not just headers)
    $request = HTTP::Request->new(GET => $url);
    $result  = $ua->request($request);

    # Update status in the spider table, using HTTP status code
    my $code  = $result->code;
    my $url_q = $dbh->quote($url);
    $dbh->do("UPDATE spider SET status=$code WHERE url=$url_q ")
        or print "$DBI::errstr";

    if ($code == 200) {

        # Get title, body
        $title   = $result->title;
        $content = $result->content;
        $text    = strip($content);

        # Collect the link attributes of every <a> tag. An anonymous
        # closure is used instead of a named sub nested inside this one.
        my @found;
        my $p = HTML::LinkExtor->new(
            sub {
                my ($tag, %attr) = @_;
                return if $tag ne 'a';
                push @found, values %attr;
            }
        );
        $p->parse($content) or die;
        $p->eof;    # flush any text the parser is still buffering

        # Fix up links (remove trailing #fragment; make URLs absolute)
        my $base = $result->base;
        @links = map {
            ( my $link = $_ ) =~ s/#.*//;
            url($link, $base)->abs;
        } @found;
    }

    return ($code, $title, $text, @links);
}

# update_db($url, $title, $text, @links)
#
# Stores one spidered page: a row in doc, one row per word in word (with
# its position in the text), and one row in spider for every link that
# passes the site filter. Does nothing when $text is missing or is 10
# characters or shorter. Dies if the doc row cannot be inserted.
sub update_db {
    # Lexical copies: the arguments no longer overwrite the globals of
    # the same names.
    my ($url, $title, $text, @links) = @_;

    return unless defined $text && length($text) > 10;

    # Quote the title before inserting
    my $url_q   = $dbh->quote($url);
    my $title_q = $dbh->quote($title);

    # Store URL and title in the doc table
    $dbh->do("INSERT INTO doc (url,title) VALUES ($url_q,$title_q)")
        or die "$DBI::errstr";

    # Get the auto_generated id
    my $doc_id = $dbh->{'mysql_insertid'};

    # For each word, create an entry in the word table. split ' ' skips
    # leading whitespace and treats runs of blanks as one separator, so
    # no empty "words" are stored and positions stay consecutive.
    my $counter = 0;
    foreach my $word ( split ' ', $text ) {

        # Quote before inserting
        my $word_q = $dbh->quote($word);
        $dbh->do("INSERT INTO word (id,word,position) VALUES ($doc_id,$word_q,$counter)")
            or print "$DBI::errstr";
        $counter++;
    }

    # Insert links into the spider table
    foreach my $link (@links) {

        # Add some criteria below -- e.g.:
        if ( $link =~ /http.*csf.edu/ ) {

            # Quote before inserting
            my $link_q = $dbh->quote($link);
            $dbh->do("INSERT INTO spider (url) VALUES ($link_q)")
                or print "$DBI::errstr";
        } else {
            print "Ignoring link to $link\n";
        }
    }
}

# NOTE(review): strip() is cut off in this copy of the listing; its surviving text is kept verbatim on the next line.
sub strip { my ($html) = shift; $html =~ s/[\r\n]/ /g; # Kill linefeeds, etc. $html =~ s/(.*)
We're sorry, but your search found no results at this time.\n";} } sub header { print "Content-type: text/html\n\n"; print "
Search: $words\n"; } sub footer { print "