#!/usr/bin/perl -W
#
# spider.pl   Set tabstops to 3.
#

$| = 1;
# 0=no debug, 1=display progress, 2=complete dump
$DEBUG = 2;
# Check hyperlinks to other hosts?
$SPANHOSTS = "off";	

if(scalar(@ARGV) < 2){
    print "Usage: $0 <fully-qualified-URL> <search-phrase>\n";
    exit 1;
}

# Initialize.
%URLqueue = ();
chop($client_host=`hostname`);
$been = 0;
$search_phrase = $ARGV[1];

# Load the queue with the first URL to hit.
$URLqueue{$ARGV[0]} = 0;
$thisURL = &find_new(%URLqueue);

# While there's a URL in our queue which we haven't looked at ...
while($thisURL ne ""){

    # Progress report.
    $count = 0;
    while(($key,$value) = each(%URLqueue)){
        $count ++;
    }
    print "-----------------------------------------\n" if($DEBUG>=1);
    printf("Been: %d  To Go: %d\n", $been, $count-$been)
    if($DEBUG>=1);
    print "Current URL: $thisURL\n" if($DEBUG>=1);
    &dump_stack() if($DEBUG>=2);
    
    # Split the protocol from the URL.
    ($protocol, $rest) = $thisURL =~ m|^([^:/]*):(.*)$|;
    
    # If the protocol is http, fetch the page and process it.
    if($protocol eq "http"){
    
        print "rest = $rest\n";
        # Split out the hostname, port and document.
        ($server_host, $port, $document) =
                $rest =~ m|^//([^:/]*):*([0-9]*)/*([^:]*)$|;

        # Get the page of text and remove CR/LF characters and HTML
        # comments from it.
        $port = 80;
        print "$client_host, $server_host, $port, $document\n";
        $page_text = &get_http($client_host, $server_host, $port, $document);
        $page_text =~ tr/\r\n//d;
        $page_text =~ s|<!--[^>]*-->||g;

        # Report if our search string is found here.
        if($page_text =~ m|$search_phrase|i){
                print "$thisURL\n"
        }	

        # Find anchors in the HTML and update our list of URLs..
        (@anchors) = $page_text =~ m|<A[^>]*HREF\s*=\s*"([^">]*)"|gi;
        foreach $anchor (@anchors){
            $newURL = &fqURL($thisURL, $anchor);
            if($URLqueue{$newURL} > 0){

                # Increment the count for URLs we've already 
                # checked out.
                $URLqueue{$newURL}++;

            }else{

                # Add a zero record for URLs we haven't 
                # encountered.
                # Optionally, ignore URL's which point to other
                # hosts.
                ($new_host) =
                $newURL =~ m|^[^:/]*:/*([^/:]*):*[0-9]*/*[^:]*$|;
                if($SPANHOSTS eq "on" || $new_host eq 
                    $server_host){
                    $URLqueue{$newURL}=0;
                }
            }
        }
    }else{
            print "Protocol '$protocol' ignored.\n" if($DEBUG>=1);
    }
    
    # Record the fact that we've been here, and get a new URL to process.
    $URLqueue{$thisURL} ++;
    $been ++;
    $thisURL = &find_new(%URLqueue);
    
}
exit;

#--------------------------------------------------------------
# Build a fully specified URL.
#--------------------------------------------------------------
sub fqURL
{
    local($thisURL, $anchor) = @_;
    local($has_proto, $has_lead_slash, $currprot, $currhost, $newURL);
    
    # Strip anything following a number sign '#', because its
    # just a reference to a position within a page.
    $anchor =~ s|^.*#[^#]*$|$1|;
    
    # Examine anchor to see what parts of the URL are specified.
    $has_proto = 0;
    $has_lead_slash=0;
    $has_proto = 1 if($anchor =~ m|^[^/:]+:|);
    $has_lead_slash = 1 if ($anchor =~ m|^/|);
    
    if($has_proto == 1){
    
        # If protocol specified, assume anchor is fully qualified.
        $newURL = $anchor;
    
    }
    elsif($has_lead_slash == 1){
    
        # If document has a leading slash, it just needs protocol and host.
        ($currprot, $currhost) = $thisURL =~ m|^([^:/]*):/+([^:/]*)|;
        $newURL = $currprot . "://" . $currhost . $anchor;
    
    }	
    else{
    
        # Anchor must be just relative pathname, so append it to current URL.
        ($newURL) = $thisURL =~ m|^(.*)/[^/]*$|;
        $newURL .= "/" if (! ($newURL =~ m|/$|));
        $newURL .= $anchor;
    
    }
    if($DEBUG >=2){
        print "Link Found\n   In:$thisURL\n   Anchor:$anchor\n   Result: $newURL\n"
    }
    return $newURL;
}

#---------------------------------------------------------------
# Do a linear search of the URL stack to find a URL with a data
# value of 0 (i.e. one we haven't checked out yet).
#---------------------------------------------------------------
sub find_new
{
    local(%URLqueue) = @_;
    local($key, $value);
    
    while(($key, $value) = each(%URLqueue)){
        return $key if($value == 0);
    }
    return "";
}

#-------------------------------------------------------------------
# Debugging utility.
#-------------------------------------------------------------------
sub dump_stack
{
    local($key);
    local($done, $togo) = ("", "");
    
    foreach $key (keys(%URLqueue)){
        if($URLqueue{$key} == 0){
            $togo .= "  " . $key . "\n";
        }else{
            $done .= "  " . $key . " (hitcount = "
                . $URLqueue{$key} . ")\n";
        }
    }
    
    print "Been There:\n" . $done;
    print "To Go:\n" . $togo;
    print "------- Hit Q to Quit, Enter to Continue -------\n";
    read(STDIN, $key, 1);
    exit(1) if($key eq 'Q' || $key eq 'q');
}

#-------------------------------------------------------------------------
# Get the page indicated by the $server_host and $document parameters.
#-------------------------------------------------------------------------
sub get_http
{
    local($client_host, $server_host, $port, $document) = @_;
    local($name,$aliases,$type,$len);
    local($this,$thisaddr,$that,$thataddr);
    local($client_host, $sockaddr, $a,$b,$c,$d);
    local($page, $header, $header_text, $content);
    
    # Some constants used to access the TCP network.
    $AF_INET=2;
    $SOCK_STREAM=1;
    
    # Use default http port if none specified.
    $port = 80; #if($port == 0);
    
    # Get the protocol number for TCP.
    ($name,$aliases,$proto)=getprotobyname("tcp");
    
    # Get the IP addresses for the two hosts.
    ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($client_host);
    ($name,$aliases,$type,$len,$thataddr) = gethostbyname($server_host);
    
    # Check we could resolve the server host name.
    ($a,$b,$c,$d) = unpack('C4', $thataddr);
    if($a eq "" && $b eq "" && $c eq  "" && $d eq ""){
        print "ERROR: Unknown host $server_host.\n";
        return "";
    }
    print "Server: $server_host ($a.$b.$c.$d)\n" if($DEBUG>=2);
    
    # Pack the AF_INET magic number, the port, and the (already packed) IP
    # addresses into the same format as the C structure would use. Note
    # this is architecture dependent: this pack format works for 32 bit
    # architectures.
    $sockaddr="S n a4 x8";
    $this=pack($sockaddr, $AF_INET, 0, $thisaddr);
    $that=pack($sockaddr, $AF_INET, $port, $thataddr);
    
    # Create the socket and connect.
    if(!socket(S, $AF_INET, $SOCK_STREAM, $proto)){
        print "ERROR: Cannot create socket.\n";
        return "";
    }
    print "Socket OK\n" if($DEBUG>=2);
    if(!connect(S, $that)){
        print "ERROR: Cannot connect to server $server_host,
            port $port.\n";
        return "";
    }
    print "Connect OK\n" if($DEBUG>=2);
    
    # Turn buffering in the socket off, and send request to the server.
    select(S); $| = 1; select(STDOUT);
    print S "GET /$document HTTP/1.0\n\n";
    
    # Receive the response. Check to ensure the response is of MIME
    # type text/html or text/plain.
    $page = "";
    $header = 1;
    $header_text = "";
    while(<S>){
        # Check if we've hit the end of the HTTP header (an empty line).
        # If we have, check for a content-type header line, and ensure
        # it is valid.
        if( m|^[\n\r]*$| ){
            $header = 0;
            ($content) = $header_text =~ m|Content-type: (\S+)|i;
            if($content ne "text/html" && $content ne "text/plain"){
                print "Content type '$content' ignored.\n" 
                        if($DEBUG>=1);
                last;
            }
        }
        # Save to a header string if we're still working on the HTTP
        # header.
        elsif($header == 1){
                $header_text .= "   " . $_;
        }
        # Otherwise, save to the html page string.
        else{
                $page .= $_;
        }
    
        print "$_\n";
        
        print "HTTP header: \n $header_text" if($DEBUG>=2);
        return $page;
    }
}
