#!/usr/bin/perl -W

#usage example:
# ./loadwebpage.pl www.sportsline.com /nba/scoreboard/20060301 > nbaresults20050605.txt


use strict;
use IO::Socket;
use HTML::Parser 3.00 ();

# Script entry point: the host name and the document path are taken from the
# command line, in that order.
my ($domain, $document) = @ARGV;

# Fail early with a usage message instead of letting undefined values reach
# the socket and regex code further down.
die "Usage: $0 <domain> <document-path>\n"
    unless defined $domain and defined $document;

Main($domain, $document);

# Main($domain, $document)
#
# Loads $document from $domain, prints the directory part of $document
# ("base") to STDOUT, collects every distinct link on that page of the form
# <a href="$base/..."> and then, in reverse string-sorted order, loads each
# linked page and passes it to PrintTables together with the part of the
# link that follows "$base/".  Progress messages go to STDERR.
sub Main
{
    my ($domain, $document) = @_;
    my $data = LoadHtmlPage($domain, $document);

    # Everything before the last "/" of the document path.  A path without
    # any "/" yields an empty base instead of an undefined value.
    my ($base) = $document =~ m|^(.*)/|;
    $base = '' unless defined $base;
    print "$base\n";

    # \Q...\E makes the base match literally; without it characters such as
    # "." or "|" in the path would be interpreted as regex syntax.
    my @pages = $data =~ m|<a href=\"(\Q$base\E/[^\"]*)\">|g;

    # Hash keys de-duplicate the links.
    my %pagelist;
    @pagelist{@pages} = (1) x @pages;

#    PrintTables($data, "20060000");

#    my @Links = GetHtmlPageLinks($data);
#    print "$_\n" foreach (@Links);

#    my $pagedata = LoadHtmlPage('www.sportsline.com','/nba/gamecenter/boxscore/NBA_20050602_DET@MIA');
#    my $pagedata = LoadHtmlPage('www.sportsline.com','/nba/scoreboard/20050401');
#    my $pagedata = LoadHtmlPage('www.nfl.com','/scores/2005/week1');
#    PrintTables($pagedata, '20050602');

    foreach my $page (reverse sort keys %pagelist)
    {
        print STDERR "Loading $page\n";
        my $pagedata = LoadHtmlPage($domain, $page);

        # Every collected link starts with "$base/", so the remainder is
        # what gets handed to PrintTables as the date label.
        my ($date) = $page =~ m|\A\Q$base\E/(.*)|;
        PrintTables($pagedata, $date);
    }
}

# GetHtmlPageLinks($data)
#
# Returns, in document order, the href value of every anchor in $data that
# is written exactly as <a href="...">, i.e. with href as its only attribute.
sub GetHtmlPageLinks
{
    my ($html) = @_;
    my @found;

    # Walk the string match by match, collecting the captured href each time.
    while ($html =~ m|<a href=\"([^\"]*)\">|g)
    {
        push @found, $1;
    }

    return @found;
}

# LoadHtmlPage($domain, $document)
#
# Fetches $document from $domain on port 80 with a plain HTTP/1.0 GET and
# returns the response as one string after some clean-up (see the steps
# below).  Dies if the connection cannot be opened or the request cannot
# be sent.  The HTTP status line is not inspected.
sub LoadHtmlPage
{
    my ($domain, $document) = @_;

    # IO::Socket::INET reports the reason for a failed connect in $@.
    my $sock = IO::Socket::INET->new(
        PeerAddr => $domain,
        PeerPort => 80,
        Proto    => 'tcp',
    ) or die "Could not open webpage : $@\n";

    # HTTP lines are terminated by CRLF; the Host header lets servers that
    # host several sites on one address pick the right one.
    print {$sock} "GET $document HTTP/1.0\r\n",
                  "Host: $domain\r\n",
                  "\r\n"
        or die "Could not send request to $domain : $!\n";

    # Read the whole response (headers included) until the server closes
    # the connection.  A lexical line variable keeps the global $_ intact.
    my $data = "";
    while (defined(my $line = <$sock>))
    {
        $data .= $line;
    }
    close($sock);

    # The order of the following steps matters and is kept as is.
    $data =~ tr/\r\n//d;                      # join everything into one line
    $data =~ s|<!--.*?-->||gs;                # drop HTML comments
    $data =~ s|<script.*?</script>||igs;      # drop script elements
    $data =~ s|&nbsp;| |gi;                   # non-breaking space -> space
    $data =~ s|&#[0-9]*;|#|g;                 # numeric entities -> "#"
    $data =~ s|<style.*?</style>||igs;        # drop style elements
    $data =~ s|^.*?<html>|<html>|i; # strip off leading junk
    $data =~ s|#[0-9]*||g;                    # remove "#" plus any digits after it
    $data =~ s|\([0-9]+-[0-9]+\)||g; # Remove the current standing of the team

    return $data;
}

# PrintTables($pagedata, $date)
#
# Parses the HTML in $pagedata with HTML::Parser, looking only at table, tr
# and td tags, and prints one line per finished game to STDOUT:
#
#     "$date $team1: $score1 -- $team2: $score2"
#
# followed by one empty line after the page has been parsed.
#
# How a game is recognised:
#   * a td cell whose text is exactly 'Final' starts a game (stage 1);
#   * the first alphanumeric cell text after the next </tr> is team 1
#     (stage 2), and the last alphanumeric cell text seen before that
#     row's </tr> is its score;
#   * the following row supplies team 2 and its score the same way (stage 3);
#   * the next </table> ends the game and triggers the output line.
#
# All state is lexical and private to one call, so nothing carries over
# from a previous page or a previous game.  A game whose table closes
# before all four values were captured is reported on STDERR and skipped
# rather than printed with missing or stale values.
sub PrintTables
{
    my ($pagedata, $date) = @_;

    my $lasttag  = "";   # most recent start tag (table, tr or td)
    my $lastend  = "";   # most recent end tag (table, tr or td)
    my $lasttext = "";   # most recent accepted cell text while in a game
    my $infinal  = 0;    # 0 = not in a game, 1..3 = stage described above
    my ($team1, $team2, $score1, $score2);

    my $on_start = sub
    {
        my ($tag) = @_;
        $lasttag = $tag;
    };

    my $on_end = sub
    {
        my ($tag) = @_;

        if ($tag eq 'table')
        {
            if ($infinal)
            {
                $infinal = 0;
                if (grep { !defined $_ } $team1, $score1, $team2, $score2)
                {
                    print STDERR "Skipping incomplete result for $date\n";
                }
                else
                {
                    print "$date $team1: $score1 -- $team2: $score2\n";
                }
            }
        }
        elsif ($tag eq 'tr')
        {
            # The last accepted text of a team row is taken as its score.
            if    ($infinal == 2) { $score1 = $lasttext; }
            elsif ($infinal == 3) { $score2 = $lasttext; }
        }

        $lastend = $tag;
    };

    my $on_text = sub
    {
        my ($text) = @_;

        # Only text following a <td> that contains a letter or digit counts.
        return if $lasttag ne 'td' or $text !~ /[a-zA-Z0-9]/;

        # First accepted text after a </tr> names the team of that row.
        if    ($infinal == 1 and $lastend eq 'tr') { $team1 = $text; $infinal = 2; }
        elsif ($infinal == 2 and $lastend eq 'tr') { $team2 = $text; $infinal = 3; }

        # 'Final' (re)starts a game; forget anything captured so far.
        if ($text eq 'Final')
        {
            $infinal = 1;
            ($team1, $team2, $score1, $score2) = ();
        }

        $lasttext = $text if $infinal;
    };

    my $p = HTML::Parser->new(
        api_version => 3,
        handlers    => [start => [$on_start, "tagname"],
                        end   => [$on_end, "tagname"],
                        text  => [$on_text, "dtext"],
                       ],
        marked_sections => 1,
    );

    $p->report_tags(qw(table tr td));
    $p->parse($pagedata);
    $p->eof;    # signal end of document so buffered text is flushed
    print "\n";
}
