#!/usr/bin/perl -w # ================================================================== # Links SQL - enhanced directory management system # # Website : http://gossamer-threads.com/ # Support : http://gossamer-threads.com/scripts/support/ # Revision : $Id: nph-verify.cgi,v 1.29 2001/12/30 23:17:07 alex Exp $ # # Copyright (c) 2001 Gossamer Threads Inc. All Rights Reserved. # Redistribution in part or in whole strictly prohibited. Please # see LICENSE file for full details. # ================================================================== use strict; use lib '.'; use vars qw/$USE_HTML $BLOCK_SIZE $MAX_WORKUNIT $MIN_WORKUNIT $MAX_CHILDREN $SPAWN_DELAY $FONT $TODAY $GOOD $BAD @LINKS_TO_CHECK/; use Links qw/$IN $DB $CFG/; use GT::SQL::Condition; use Links::Parallel; use Links::Tools; $| = 1; local $SIG{__DIE__} = \&Links::fatal; Links::init('.'); main(); sub main { # ------------------------------------------------------------------- # specific configuration, you can tweak for optimimum performance # you should not need to change these parameters, if you do, you should # know what you are doing # $BLOCK_SIZE = 5000; # the bulk link handler $MAX_WORKUNIT = 100; # a child unit can up to X links to check $MIN_WORKUNIT = 3; # but it must have at least (# links permitting) Y links to check $MAX_CHILDREN = 10; # the number of children allowed to exist $SPAWN_DELAY = 2; # the amount of time between successive spawns (to prevent load spiking) $FONT = 'font face="Tahoma,Arial,Helvetica" size="2"'; @LINKS_TO_CHECK = (); $GOOD = 0; $BAD = 0; $USE_HTML = exists $ENV{REQUEST_METHOD} ? 1 : 0; $TODAY = get_date(); # get the option parameters my $params = 0; my $id = 0; my ($method); if ($USE_HTML) { print $IN->header ( -nph => $CFG->{nph_headers} ); $_ = $IN->param ("do"); } else { $_ = join " ", @ARGV; # convert command line params to parseable commands CMD_LINE: { if (/--check_all/) { $_ = "check_links"; $method=5; last CMD_LINE; } if (/--check_from\s*([0-9]+)/) { $_ = "check_links"; $method=1; $params=$1; last CMD_LINE; } if (/--check_new/) { $_ = "check_links"; $method=4; last CMD_LINE; } if (/--check_problem/) { $_ = "check_links"; $method= 3; last CMD_LINE; } if (/--check_status\s*(-?[0-9]+)/) { $_ = "check_links"; $method=6; $params=$1; last CMD_LINE; } if (/--check\s*(-?[0-9]+)/) { $_ = "check_links"; $method=7; $id=$1; last CMD_LINE; } if (/--fix_302/) { $_ = "fix_302"; last CMD_LINE; } $_ = undef; }; }; SWITCH: { if (not defined) { if ($USE_HTML) { Links::admin_page ('tools_verify.html'); last SWITCH; } else { &command_line_help (); last SWITCH; }; } if (/^check_links/) { if ($USE_HTML) { if (defined $IN->param("method")) { check_links ( $IN->param("method") || 5, $IN->param("status") || 0, $IN->param("ID") || 0, $IN->param("days") || 0, $IN->param("since"), $IN->param("to") ); } else { html_verify_select ( calclinkstatus () ); }; } else { if (defined $method) { check_links ( $method, $params, $id, $params ); } else { }; }; last SWITCH; } }; } sub command_line_help { # ------------------------------------------------------------------- # Print out a usage summary. # print qq| This script checks the Links SQL database for link integrity. The following parameters may be used from the command line: For checking links: (one of) --check_from number_of_days_ago --check_problem --check_new --check_status status_code --check_all For fixing links: (one of) --fix_302 |; }; sub check_links { #---------------------------------------------------------------------- # generates the sql query that nabs the link subset that we want # to check then checks the links # my ($action,$status,$id,$days,$since,$to) = @_; # make sure everything is what we expect it to be $status = int ( $status ); $id = int ( $id ); $days = int ( $days ); $since ||= ''; $since =~ /([0-9]{4}\/[0-9]{1,2}\/[0-9]{1,2})/; $since = $1; $to ||= ''; $to =~ /([0-9]{4}\/[0-9]{1,2}\/[0-9]{1,2})/; $to = $1; # build the query condition. my $cond = new GT::SQL::Condition; if ( $action == 1 ) { # not checked in the last N days my $tmp_date = &get_date (time - (86400 * $days)); $cond->add ("Date_Checked", "<", $tmp_date); } elsif ( $action == 2 ) { # checked last between N and O $cond->add ("Date_Checked", ">", $since); $cond->add ("Date_Checked", "<", $to); } elsif ( $action == 3 ) { # problem links my $val = join ( ",", keys %Links::Tools::STATUS_BAD ); $cond->add ("Status", "IN", \"($val)"); } elsif ( $action == 4 ) { # new links $cond->add ("Status", "=", 0); } elsif ( $action == 5 ) { # everything $cond = {}; } elsif ( $action == 6) { # check something based on status code $cond->add ("Status", "=", $status); } elsif ( $action == 7 ) { # check a certain link $cond->add ("ID", "=", $id); }; # find out how many items need to be checked..., my $link_db = $DB->table ('Links'); my $count = $link_db->count($cond); if ($USE_HTML) { print qq~ Checking Links ~; print Links::header ('Checking Links ...', 'Links SQL is now attempting to check your links, please be patient, this can take a while.', 0); print '
';
    }
    else {
        print "Checking $count links ... \n";
    }

# demo
#   print "\nChecking links has been disabled in the demo\n\n
"; # return; if ($count == 0) { print "No links to check! Quitting!\n\n"; } else { my $offset = 0; my @stats; # Remove mod_perl env so verify-child doesn't think it's under mod_perl. my $env = delete $ENV{GATEWAY_INTERFACE}; while (1) { # run the query and register it $link_db->select_options ("LIMIT $offset,$BLOCK_SIZE"); my $sth = $link_db->select ($cond, ['ID']); $offset += $BLOCK_SIZE; if ($sth->rows()<=0) { last; }; while (($id) = $sth->fetchrow_array) { push @LINKS_TO_CHECK, int($id); } my $checker = new Links::Parallel ( 'MAX_WORKUNIT' => $MAX_WORKUNIT, 'MIN_WORKUNIT' => $MIN_WORKUNIT, 'SPAWN_DELAY' => $SPAWN_DELAY, 'MAX_CHILDREN' => $MAX_CHILDREN, 'child_path' => "$CFG->{admin_root_path}/verify-child.pl", 'path_to_perl' => $CFG->{path_to_perl}, 'on_response' => \&link_report, 'debug_level' => 1 ); $checker->{to_check} = \@LINKS_TO_CHECK; $checker->wait (); push @stats, $checker->get_stats (); }; &print_summary (\@stats); } if ($USE_HTML) { print "\n\n"; } } sub link_report { # ------------------------------------------------------------------- # Callback used when a link has been checked. # my ($key, $status, $response) = @_; $response =~ /([^\t]*)\t(.*)/; my $URL = $1; $response = $2; # Make sure we have a valid response. if (! defined $key or ($key !~ /^\d+$/)) { return; } my $link_db = $DB->table ('Links'); my $ver_db = $DB->table ('Verify'); $ver_db->add ( { LinkID => $key, Status => $status, Date_Checked => $TODAY } ); $link_db->update ( { Status => $status, Date_Checked => $TODAY }, { ID => $key } ); # let people know what's up $USE_HTML ? print "Checked $key - " : print "$key\t$URL\t"; if ( $Links::Tools::STATUS_OK { $status } ) { $GOOD++; print "Success ($status). Message: $Links::Tools::STATUS_OK{$status}"; } else { $BAD++; print ("Request Failed (" . ($status ? $status : "unresolvable") . ") Message: $Links::Tools::STATUS_BAD{$status}" ); }; print "\t$response\n"; } sub print_summary { #------------------------------------------------------------------------------ # prints out the statistics for this particular link verification run my $stats = shift; my ( $elapsed, $threads_spawned, $workunit ); my ( $max_chunk, $avg_chunk, $min_chunk ); my ( $max_unit, $avg_unit, $min_unit, $num_units ); my ( $max_capita, $avg_capita, $min_capita ); for (@$stats) { my ( $nt_threads, $nt_elapsed, $nt_workunit_stats ) = @$_; $threads_spawned += $nt_threads; $elapsed += $nt_elapsed; $max_chunk = $nt_elapsed if ( !defined ($max_chunk) or ( $max_chunk < $nt_elapsed ) ); $min_chunk = $nt_elapsed if ( !defined ($min_chunk) or ( $min_chunk > $nt_elapsed ) ); foreach $workunit ( values %$nt_workunit_stats ) { my ( $wu_start, $wu_numchecked, $wu_elapsed ) = @$workunit; $num_units++; $avg_unit += $wu_elapsed; $max_unit = $wu_elapsed if ( !defined ($max_unit) or ( $max_unit < $wu_elapsed ) ); $min_unit = $wu_elapsed if ( !defined ($min_unit) or ( $min_unit > $wu_elapsed ) ); $max_capita = ( $wu_elapsed / $wu_numchecked ) if (($wu_elapsed) and (!defined ($max_capita) or ( $max_capita < ($wu_elapsed / $wu_numchecked )))); $min_capita = ( $wu_elapsed / $wu_numchecked ) if (($wu_elapsed) and (!defined ($min_capita) or ( $min_capita > ($wu_elapsed / $wu_numchecked )))); } }; my $total = $GOOD + $BAD; $avg_chunk = ( $elapsed / ( $#$stats + 1 ) ) if ( $#$stats >= 0 ); $avg_capita = ( $avg_unit / $total ) if ( $total ); $avg_unit = ( $avg_unit / $num_units ) if ( $num_units ); print qq| ------------------------------------------- Summmary: Good links: $GOOD Broken links: $BAD Total links: $total Statistics: Elapased Time: $elapsed seconds Children Spawned: $threads_spawned seconds Max Time per chunk: $max_chunk seconds Avg Time per chunk: $avg_chunk seconds Min Time per chunk: $min_chunk seconds Max Time for unit: $max_unit seconds |; printf "Avg Time for unit: %.2f seconds\n", $avg_unit; print "Min Time for unit: $min_unit seconds\n"; printf "Max Time per link: %.2f seconds\n", $max_capita; printf "Avg Time per link: %.2f seconds\n", $avg_capita; printf "Min Time per link: %.2f seconds\n", $min_capita; }; sub get_date { # -------------------------------------------------------- # Private method to translate a unix time value into a date. # my $time = shift || time; ($time =~ /^\d+$/) or ($time = time); my ($sec, $min, $hour, $day, $mon, $year, $dweek, $dyear, $tz) = localtime $time; $year = $year + 1900; $mon++; ($mon < 10) and ($mon = "0" . int $mon); ($day < 10) and ($day = "0" . int $day); return "$year-$mon-$day"; }