#!/usr/bin/perl
# Web Site Mapper
# Copyright (C) 2004  Timm Murray
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

use strict;
use warnings;
use WWW::Mechanize;
use URI;
use YAML;
use Time::HiRes qw/time/;
use Getopt::Long;

# maximum re-visit of each page
my $max_seen = 10;
my %seen;

my @sites = qw(
#blog.rot13.org
ss-petrinja.cms-qa.skole.hr
);

sub DISALLOWED_SCHEMES ()  { qw( mailto javascript ) }

my $debug = 0;
my $verbose = 0;

GetOptions(
	'debug!'	=> \$debug,
	'verbose!'	=> \$verbose,
);

sub get_page {
	my ($data, $uri) = @_;

	my $page_to_load = $uri->canonical;

	# Don't process pages that have already been loaded
	if(exists $data->{$page_to_load}) {
		warn "\t$page_to_load already indexed\n" if $debug;
		return;
	}

	# Don't process pages that aren't listed in the sites above
	unless( grep { lc($uri->host) eq lc($_) } @sites) {
		warn "\t$page_to_load not in allowed sites\n" if $debug;
		return;
	}

	print "$page_to_load";

	my $t = time();

	my $mech = WWW::Mechanize->new();
	my $response = $mech->get( $page_to_load );

	$t = time() - $t;

	$data->{$page_to_load}{status} = $mech->status;

	if($mech->success) {

		$data->{$page_to_load}{content_type} = $mech->ct;
		$data->{$page_to_load}{title} = $mech->title;

		my @links = map { $_->url_abs } $mech->links;

		if ($debug) {
			warn "\tResponse successful\n";
			warn "\tContent-type: ", $data->{$page_to_load}{content_type}, "\n";
			warn "\tTitle: ", $data->{$page_to_load}{title}, "\n";
			warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n";
		} else {
			my $size = length( $mech->content );
			print " ", $mech->status, " ", $mech->ct, sprintf(" %d in %.2fs (%.2f b/s)", $size, $t, $size / $t), "\n";
		}

		$data->{$page_to_load}{links} = [];

		foreach my $link (@links) {
			my $uri = URI->new($link);
			next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES;

			my $url = $uri->canonical->as_string;
			$url =~ s/#.*$//;
			warn "\tFollowing $url\n" if $debug;

			# discriminate against urls which are seen to many times
			my $url_no_params = $url;
			$url_no_params =~ s/\?.*$//;
			$seen{$url_no_params}++;
			if ($seen{$url_no_params} > $max_seen) {
				print "skipped $url_no_params, seen $seen{$url_no_params}\n" if ($verbose);
				next;
			}
			push @{ $data->{$page_to_load}{links} }, $url;

			get_page( $data, $uri );

		}
	} else {
		warn "\tResponse unsuccessful\n" if $debug;
	}
}


{
	foreach my $site ( @sites ) {
		next if $site =~ m/^#/;	# skip comment

		my $data;
		my $start_page = $site =~ m!^http://!i ? $site : 'http://' . $site;
		my $uri = URI->new($start_page);
		get_page( $data, $uri );
	
		print Dump($data) if ($debug);
		print "Total of ", keys(%$data) + 1, " pages crawled\n";
	}

}

