Hello,
What should I edit to make spider.cgi (the Monster-submit mod) display a description for all the pages spidered?
For your reference, the spider.cgi code follows:
#!/usr/bin/perl -w
# Script:
# Virtual Solutions Links Spider
# Copyright:
# Copyright 1999 by Virtual Solutions. Links Spider is a modification (with permission of Fluid Dynamics) of the Fluid
# Dynamics Search Engine Version 2.0 script. The Links Spider modification is freeware and is made available at no
# cost for both personal and commercial use. However, use does not constitute legal rights for resale or
# redistribution without the expressed written permission of both Virtual Solutions and Fluid Dynamics.
# Note:
# For further details including installation instructions please go to http://www.monster-submit.com/mods02.html.
# The original comment lines have been edited out but can be found in the original script at
# ftp://ftp.xav.com/search.txt.
# Fluid Dynamics Copyright Header:
# Fluid Dynamics Search Engine, Version 2.0
# Copyright 1997, 1998 by Fluid Dynamics. Please adhere to the copyright
# notice and conditions of use, described in the attached help file and
# hosted at the URL below. For the latest version and help files, visit:
# http://www.xav.com/scripts/search/
#Edit to point to domains allowed to use this script
@referers = ('www.monster-submit.com','monster-submit.com');
use Socket;
$Rules{'Hits Per Page'} = 10;
$Rules{'Multiplier: URL'} = 4;
$Rules{'Multiplier: Title'} = 10;
$Rules{'Multiplier: Keyword'} = 10;
$Rules{'Multiplier: Description'} = 4;
$Rules{'Max Characters: URL'} = 128;
$Rules{'Max Characters: Title'} = 96;
$Rules{'Max Characters: Description'} = 384;
$Rules{'Max Characters: Auto Description'} = 150;
$Rules{'Max Characters: Keywords'} = 256;
$Rules{'Max Characters: File'} = 64000;
$Rules{'Forbid All Cap Titles'} = 1;
$Rules{'Forbid All Cap Descriptions'} = 1;
$Rules{'Crawler: Minimum WhiteSpace'} = 0.01;
$Rules{'Crawler: Max Pages Per Batch'} = 12;
$Rules{'Crawler: Max Redirects'} = 6;
$Rules{'Crawler: Days Til Refresh'} = 30;
$Rules{'Crawler: User Agent'} = 'Mozilla/4.0 (compatible: FDSE robot)';
$Rules{'Crawler: Follow Query Strings'} = 0;
$Rules{'Crawler: Rogue'} = 0;
@PromoteSites = ();
$Rules{'Promote Value'} = 20;
@IgnoredWords = ('a','about','all','an','and','any','are','as','at',
'be','been','by','can','do','find','for','from','get','have','he',
'how','htm','html','http','i','if','in','is','it','me','most','new',
'no','not','of','on','one','or','other','page','s','site',
'that','the','this','to','two','use','w','web','what','when','where',
'which','who','why','will','with','you','your');
@MonthNames = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');
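# Main flow: read the form input, print the page header, check the referer,
# then spider every AddLink/URL address that was submitted.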
my %FORM = &ReadInput;
$|=1;
print <<'EOM';
Content-Type: text/html

<html>
<head>
<title>Links Spider</title>
<meta name="Robots" content="noindex">
<meta name="Robots" content="nofollow">
</head>
<body bgcolor="#FFFFFF" text="#000080" link="#3399FF" vlink="#3399FF" alink="#0000FF">
<table border="1" bgcolor="#FFFFFF">
<tr>
<td bgcolor="#C0C0C0"><font color="#FFFFFF">
<b>Attention Webmasters:</b> ALL listings are available for spidering. If you do not wish for your
site to be spidered then it will be necessary to write a
<a href="http://www.altavista.com/av/content/addurl_exclude.htm" target="new">robots.txt</a> file for the
site. Or you can include meta tags in each page between the &lt;head&gt; ... &lt;/head&gt; lines:<p>
<b>&lt;meta name="Robots" content="noindex"&gt;</b> <i>To exclude page from being spidered</i><br>
<b>&lt;meta name="Robots" content="nofollow"&gt;</b> <i>To exclude embedded links from being spidered</i><p>
</font></td>
</tr>
</table><p>
EOM
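# Only forms hosted on one of the @referers domains may invoke this script.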
local($check_referer) = 0;
if ($ENV{'HTTP_REFERER'}) {
foreach $referer (@referers) {
if ($ENV{'HTTP_REFERER'} =~ m|https?://([^/]*)$referer|i) {
$check_referer = 1;
last;
}
}
}
else {
$check_referer = 0;
}
if ($check_referer != 1) {
$E = "Error: Bad Referer. The form attempting to use this script resides at $ENV{'HTTP_REFERER'} which is not allowed to access this script.";
&error;
}
$|=0;
my @HITS;
if ($FORM{'URL'} ne "") {
if (defined $FORM{'URL'}) {
$FORM{'AddLink0'} = $FORM{'URL'};
}
foreach (keys %FORM) {
next unless (m!^AddLink!);
if ($FORM{$_} =~ m!^http://([^\/]+)/!) {
push(@AddressesToIndex,$FORM{$_});
}
else {
push(@AddressesToIndex,"$FORM{$_}/");
}
}
&AddURL(2,@AddressesToIndex);
}
exit;
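# ReadInput: split the query string (plus any command-line arguments) into
# name=value pairs, URL-decode them and return the %FORM hash.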
sub ReadInput {
my $InputString = '';
my ($Name,$Value);
my %FORM = ('Mode','','Terms','','Password','','SetPassword','','CL',0,'maxhits',0);
$InputString = $ENV{'QUERY_STRING'};
foreach ((split(m!\&!,$InputString)),@ARGV) {
next unless (m!^(.*?)=(.*)$!);
($Name,$Value) = ($1,$2);
$Name =~ s!\%([a-fA-F0-9][a-fA-F0-9])!pack('C',hex($1))!eg;
$Name =~ tr!+! !;
$Value =~ tr!+! !;
$Value =~ s!\%([a-fA-F0-9][a-fA-F0-9])!pack('C',hex($1))!eg;
$FORM{$Name} = $Value;
}
return %FORM;
}
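# GetAbsoluteAddress: resolve a link found on a page against that page's URL.
# Non-HTTP schemes return ''; fragments are dropped, :80 port suffixes are
# stripped, and /./ and /../ path segments are collapsed.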
sub GetAbsoluteAddress {
my ($Link,$URL) = @_;
if (($Link =~ m!^\/!) && ($URL =~ m!^http\:\/\/([^\/]+)!i)) {
$Link = "http://$1$Link";
}
elsif (($Link =~ m!^(\w+)\:!) && ($1 !~ m!^http$!i)) {
return '';
}
elsif (($Link !~ m!^http\:\/\/!i) && ($URL =~ m!^(.*)\/!)) {
$Link = $1.'/'.$Link;
}
if ($Link =~ m!^(.*?)\#!) {
$Link = $1;
}
$Link =~ s!^HTTP\:\/\/!http\:\/\/!i;
if ($Link =~ m!^http://([^\/]+)\:80$!) {
$Link = 'http://'.$1.'/';
}
elsif ($Link =~ m!^http://([^\/]+)\:80/(.*)$!) {
$Link = 'http://'.$1.'/'.$2;
}
if ($Link =~ m!^http://([^\/]+)$!) {
$Link .= '/';
}
$Link =~ s!/\./!/!g;
while ($Link =~ m!^([^\?]+)\/([^\/|\.]+)\/\.\.\/(.*)$!) {
$Link = $1.'/'.$3;
}
return $Link;
}
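# OpenSocket: open a TCP connection to host $THEM on port $PORT, caching DNS
# lookups in %HashIP. Failures are reported through &error, which exits.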
sub OpenSocket {
my ($THEM,$PORT) = @_;
unless (socket(HTTP, PF_INET, SOCK_STREAM, getprotobyname('tcp'))) {
$E = "Error: Low-level socket() function failed with system error \"$!\"";
&error;
}
if ($HashIP{$THEM}) {
$HexIP = $HashIP{$THEM};
}
else {
$HexIP = inet_aton($THEM);
$HashIP{$THEM} = $HexIP;
}
if ((!($HexIP)) || ($HexIP eq 'fail')) {
$HexIP = 'fail';
$E = "Error: Hostname $THEM does not have a DNS entry (no corresponding IP address could be found for this machine). The address may have been mistyped, the site may no longer be online, it's domain may have expired or network errors could have prevented resolution.";
&error;
}
unless (connect(HTTP, sockaddr_in($PORT,$HexIP))) {
$E = "Error: Connect() failed with system error \"$!.\" Typically connect errors involve unreachable or non-functional servers, incorrect port numbers, local DNS problems or a corrupt TCP environment";
&error;
}
select(HTTP);
$|=1;
select(STDOUT);
return 1;
}
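# GetRobotFile: build up the $RobotForbidden exclusion pattern and open a
# connection to the host (to retrieve its robots.txt rules).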
sub GetRobotFile {
($THEM,$PORT,$RobotForbidden) = @_;
if ($RobotForbidden) {
$RobotForbidden .= '|';
}
else {
$RobotForbidden = '';
}
$RobotForbidden .= '(';
$RobotForbidden .= quotemeta("$THEM.robot");
$RobotForbidden .= ')';
unless (&OpenSocket($THEM,$PORT)) {
print "\n";
}
}
unless (length($HTMLText) > 24) {
$E = "Error: Less than 24 bytes of HTML text";
&error;
}
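# Count the blank spaces (s/// in scalar context returns the number of
# substitutions made) and reject pages whose whitespace ratio falls below
# the configured minimum.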
$NumSpaces = ($HTMLText =~ s! ! !g);
if (($NumSpaces/length($HTMLText)) < $Rules{'Crawler: Minimum WhiteSpace'}) {
$E = "Error: Suspicious content - only $NumSpaces blank spaces in " . length($HTMLText) . " characters. \n";
$E .= "This is forbidden by the 'WhiteSpace Ratio' set up in the \$Rules{} array";
&error;
}
else {
return ($URL,&RawTranslate($HTMLText));
}
}
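# RawTranslate: flatten newlines and tabs to spaces and convert HTML
# character entities (named and numeric) to their ISO-8859-1 characters.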
sub RawTranslate {
$_ = shift;
tr!\n\r\t! !;
s/\&nbsp\;/ /g;
s/(\&Agrave\;|\&\#192\;)/À/g;
s/(\&Aacute\;|\&\#193\;)/Á/g;
s/(\&Acirc\;|\&\#194\;)/Â/g;
s/(\&Atilde\;|\&\#195\;)/Ã/g;
s/(\&Auml\;|\&\#196\;)/Ä/g;
s/(\&Aring\;|\&\#197\;)/Å/g;
s/(\&AElig\;|\&\#198\;)/Æ/g;
s/(\&Ccedil\;|\&\#199\;)/Ç/g;
s/(\&Egrave\;|\&\#200\;)/È/g;
s/(\&Eacute\;|\&\#201\;)/É/g;
s/(\&Ecirc\;|\&\#202\;)/Ê/g;
s/(\&Euml\;|\&\#203\;)/Ë/g;
s/(\&Igrave\;|\&\#204\;)/Ì/g;
s/(\&Iacute\;|\&\#205\;)/Í/g;
s/(\&Icirc\;|\&\#206\;)/Î/g;
s/(\&Iuml\;|\&\#207\;)/Ï/g;
s/(\&Ograve\;|\&\#210\;)/Ò/g;
s/(\&Oacute\;|\&\#211\;)/Ó/g;
s/(\&Ocirc\;|\&\#212\;)/Ô/g;
s/(\&Otilde\;|\&\#213\;)/Õ/g;
s/(\&Ouml\;|\&\#214\;)/Ö/g;
s/(\&times\;|\&\#215\;)/×/g;
s/(\&Oslash\;|\&\#216\;)/Ø/g;
s/(\&Ugrave\;|\&\#217\;)/Ù/g;
s/(\&Uacute\;|\&\#218\;)/Ú/g;
s/(\&Ucirc\;|\&\#219\;)/Û/g;
s/(\&Uuml\;|\&\#220\;)/Ü/g;
s/(\&Yacute\;|\&\#221\;)/Ý/g;
s/(\&THORN\;|\&\#222\;)/Þ/g;
s/(\&szlig\;|\&\#223\;)/ß/g;
s/(\&agrave\;|\&\#224\;)/à/g;
s/(\&aacute\;|\&\#225\;)/á/g;
s/(\&acirc\;|\&\#226\;)/â/g;
s/(\&atilde\;|\&\#227\;)/ã/g;
s/(\&auml\;|\&\#228\;)/ä/g;
s/(\&aring\;|\&\#229\;)/å/g;
s/(\&aelig\;|\&\#230\;)/æ/g;
s/(\&ccedil\;|\&\#231\;)/ç/g;
s/(\&egrave\;|\&\#232\;)/è/g;
s/(\&eacute\;|\&\#233\;)/é/g;
s/(\&ecirc\;|\&\#234\;)/ê/g;
s/(\&euml\;|\&\#235\;)/ë/g;
s/(\&igrave\;|\&\#236\;)/ì/g;
s/(\&iacute\;|\&\#237\;)/í/g;
s/(\&icirc\;|\&\#238\;)/î/g;
s/(\&iuml\;|\&\#239\;)/ï/g;
s/(\&eth\;|\&\#240\;)/ð/g;
s/(\&ntilde\;|\&\#241\;)/ñ/g;
s/(\&ograve\;|\&\#242\;)/ò/g;
s/(\&oacute\;|\&\#243\;)/ó/g;
s/(\&ocirc\;|\&\#244\;)/ô/g;
s/(\&otilde\;|\&\#245\;)/õ/g;
s/(\&ouml\;|\&\#246\;)/ö/g;
s/(\&divide\;|\&\#247\;)/÷/g;
s/(\&oslash\;|\&\#248\;)/ø/g;
s/(\&ugrave\;|\&\#249\;)/ù/g;
s/(\&uacute\;|\&\#250\;)/ú/g;
s/(\&ucirc\;|\&\#251\;)/û/g;
s/(\&uuml\;|\&\#252\;)/ü/g;
s/(\&yacute\;|\&\#253\;)/ý/g;
s/(\&thorn\;|\&\#254\;)/þ/g;
s/(\&yuml\;|\&\#255\;)/ÿ/g;
return $_;
}
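# CompressStrip: reduce a string to a plain word list - non-word characters
# become spaces, stop words from @IgnoredWords are removed and whitespace is
# collapsed. Used to build the searchable fields of a record.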
sub CompressStrip {
$_ = shift;
$_ = ' '.$_.' ';
s!\W! !g;
s!\_! !g;
$_ = StripIgnoreWords($_);
s!\s+! !g;
return $_;
}
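# AddURL: the crawl loop. Fetches each address (up to 'Crawler: Max Pages
# Per Batch'), follows up to 'Crawler: Max Redirects' 302 responses, builds
# an index record via &MakeRecord and a display entry via &AdminVersion,
# then prints the results and every embedded link collected.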
sub AddURL {
my ($tag,@AddressesToIndex) = (@_);
$|=1;
$SetSaveLinks = 1;
$NumRank = 0;
$NumRedirectsFollowed = 0;
$MaxAddresses = scalar @AddressesToIndex;
ADDRESS: for ($AddressIndex = 0; $AddressIndex < $MaxAddresses; $AddressIndex++) {
if ($Rules{'Crawler: Max Pages Per Batch'} <= $AddressIndex) {
push(@IndexedAddresses,'DONE');
last ADDRESS;
}
$URL = $AddressesToIndex[$AddressIndex];
if ($URL !~ m!^http://!i) {
$NumRank++;
$SpiderResults{$URL} = -1;
push(@IndexedAddresses,$URL);
next ADDRESS;
}
$OldURL = $URL;
($URL,$Text) = &GetStringByURL($URL);
if (($Text eq '302') && ($NumRedirectsFollowed < $Rules{'Crawler: Max Redirects'})) {
$NumRank++;
$SpiderResults{$OldURL} = -1;
if ($URL =~ m!http://([^\/]+)$!) {
$URL .= '/';
}
@AddressesToIndex = ('',@AddressesToIndex);
$AddressesToIndex[$AddressIndex+1] = $URL;
$MaxAddresses++;
$NumRedirectsFollowed++;
push(@IndexedAddresses,$OldURL);
next ADDRESS;
}
elsif ($Text eq '302') {
$NumRank++;
$SpiderResults{$OldURL} = -1;
push(@IndexedAddresses,$OldURL);
next ADDRESS;
}
unless ($Text) {
$NumRank++;
$SpiderResults{$URL} = -1;
push(@IndexedAddresses,$URL);
next ADDRESS;
}
$RecordLine = &MakeRecord($URL,'',$Text);
$SpiderResults{$URL} = $RecordLine;
$ByteSize = length($Text);
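# The record begins CCDDMMYYYY: skip the two-digit promote value, then
# unpack day (2 chars), month (2 chars) and year (4 chars).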
($DD,$MM,$YYYY) = unpack('A2A2A4',substr($RecordLine,2,8));
$NumRank++;
$Month = $MonthNames[$MM];
$UserResults{$URL} = &AdminVersion($NumRank, $URL, $Title, $Description, $ByteSize, $DD, $Month, $YYYY);
push(@IndexedAddresses,$URL);
next ADDRESS;
}
ADDRESS: foreach $URL (@IndexedAddresses) {
if ($UserResults{$URL}) {
print $UserResults{$URL};
}
}
&CompileLinks;
$LinkCount = scalar (keys %SaveLinks);
if ($LinkCount) {
$QueryString = "";
$QueryString =~ tr! !+!;
$LinkCount = 1;
$PastTime = time - (86400 * $Rules{'Crawler: Days Til Refresh'});
($UnSearched,$OutDated,$Searched,$Failed,$Checked) = (0,0,0,0,1);
foreach (reverse (sort {$SaveLinks{$b} <=> $SaveLinks{$a} || $a cmp $b} keys %SaveLinks)) {
if ($SaveLinks{$_} == 1) {
if ($UnSearched == 0) {
$UnSearched = 1;
$Checked = 1;
}
}
elsif ($SaveLinks{$_} == 2) {
if ($Failed == 0) {
$Failed = 1;
$Checked = 0;
}
}
elsif ($SaveLinks{$_} <= $PastTime) {
if ($OutDated == 0) {
$OutDated = 1;
$Checked = 1;
}
}
else {
if ($Searched == 0) {
$Checked = 0;
$Searched = 1;
}
}
print "<a href=\"$_$QueryString\">$_</a>
\n";
$LinkCount++;
}
}
else {
print <<"EOM";
No embedded links were found during this crawl session
EOM
}
print <<"EOM";
</body>
</html>
EOM
}
sub CompileLinks {
foreach (@SavedLinks) {
$SaveLinks{$_} = 1;
# push(@Global,"$_");
}
}
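# MakeRecord: pack one index line - a fixed-width numeric header (promote
# value, day, month, year, byte count) followed by the URL, title,
# description and compressed keyword fields.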
sub MakeRecord {
my ($URL, $LastModT, $sText) = @_;
$FBYTES = sprintf('%.0f',length($sText));
($Title,$Description,$sText,$Links) = &Extract_Meta($sText,$URL);
$AlphaData = ' ';
$AlphaData .= "u= $URL ";
$AlphaData .= "t= $Title ";
$AlphaData .= "d= $Description ";
$AlphaData .= 'uM='.CompressStrip($URL);
$AlphaData .= 'h='.CompressStrip($sText);
$AlphaData .= 'l='.CompressStrip($Links);
$LastModT = $LastModT ? $LastModT : time;
($DD,$MM,$YYYY) = (localtime($LastModT))[3..5];
$YYYY += 1900;
$CC = 1;
foreach (@PromoteSites) {
next unless ($URL =~ m!^$_!i);
$CC = $Rules{'Promote Value'};
last;
}
for ($CC,$DD,$MM) {
$_ = sprintf('%02.0f',$_);
}
return "$CC$DD$MM$YYYY$FBYTES$AlphaData\n";
}
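# Extract_Meta: pull the title, META keywords/description and embedded
# <A HREF>/<FRAME SRC> links out of the page. Scripts, styles and tags are
# stripped; if no META description exists, one is built from the first
# 'Max Characters: Auto Description' characters of body text.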
sub Extract_Meta {
($HTML_Text,$URL) = @_;
($Title, $Description, $Links) = ('','','');
foreach (split(m!\<A !i, $HTML_Text)) {
next unless (m!^([^\>]*)HREF(\s+)?=(\s+)?\"?([^\"\s\>]+)!i);
$ThisLink = $4;
$Links .= ' '.$ThisLink;
next unless ($SetSaveLinks == 1);
next if (($Rules{'Crawler: Follow Query Strings'} == 0) && ($ThisLink =~ m!\?!));
$ThisLink = &GetAbsoluteAddress($ThisLink,$URL);
push(@SavedLinks,$ThisLink) if ($ThisLink);
}
foreach (split(m!\<I?FRAME !i, $HTML_Text)) {
next unless (m!^([^\>]*)SRC(\s+)?=(\s+)?\"?([^\"\s\>]+)!i);
$ThisLink = $4;
$Links .= ' '.$ThisLink;
next unless ($SetSaveLinks == 1);
next if (($Rules{'Crawler: Follow Query Strings'} == 0) && ($ThisLink =~ m!\?!));
$ThisLink = &GetAbsoluteAddress($ThisLink,$URL);
push(@SavedLinks,$ThisLink) if ($ThisLink);
}
$HTML_Text .= ' || ';
if ($HTML_Text =~ m!<TITLE.*?>(.*?)<!i) {
$Title = ' '.$1;
$HTML_Text =~ s!<TITLE.*?>.*?<\/TITLE>! !i;
$HTML_Text .= $Title x $Rules{'Multiplier: Title'};
}
elsif (($FILE) && ($FILE =~ m!([^\/]+)$!)) {
$Title = $1;
}
elsif ($URL =~ m!([^\/]+)$!) {
$Title = $1;
}
elsif ($FILE) {
$Title = $FILE;
}
elsif ($URL) {
$Title = $URL;
}
else {
$Title = 'Document';
}
if (($Rules{'Forbid All Cap Titles'}) && ($Title !~ m![a-z]!)) {
$NewTitle = '';
foreach (split(m!\s+!,$Title)) {
unless (length($_) > 1) {
$NewTitle .= $_.' ';
next
}
$NewTitle .= ' '.substr($_,0,1);
$_ = substr($_,1,(length($_)-1));
tr[A-Z][a-z];
$NewTitle .= $_;
}
$Title = $NewTitle;
}
if ($HTML_Text =~ m!.*?<META([^\>]*?)(NAME|HTTP-EQUIV)="keywords"([^\>]*?)(CONTENT|VALUE)="([^\"]+)"!i) {
$KeyWords = ' '.$5;
$HTML_Text .= $KeyWords x $Rules{'Multiplier: Keyword'};
}
if ($HTML_Text =~ m!.*?<META([^\>]*?)(NAME|HTTP-EQUIV)="description"([^\>]*?)(CONTENT|VALUE)="([^\"]+)"!i) {
$Description = ' '.$5;
$HTML_Text .= $Description x $Rules{'Multiplier: Description'};
}
$HTML_Text =~ s/<[^>]*\s+ALT\s*=\s*"(([^>"])*)"[^>]*>/ $1 /ig;
$NoScript = '';
foreach (split(m!(\<\/SCRIPT>|\<\/STYLE>)!i, $HTML_Text)) {
next unless $_;
if (m!^(.*)(\<SCRIPT|\<STYLE)!i) {
$NoScript .= ' '.$1;
}
else {
$NoScript .= ' '.$_;
}
}
$HTML_Text = $NoScript;
if ($HTML_Text =~ m!(.*)<NOFRAMES>(.*)</NOFRAMES>(.*?)!i) {
if (length($2) < 2000) {
$HTML_Text = $1.' '.$2;
}
}
$HTML_Text =~ s!<([^>]*?)>! !g;
$HTML_Text =~ s!\s+! !g;
unless ($Description) {
$tempDescription = substr($HTML_Text,0,$Rules{'Max Characters: Auto Description'});
if ($tempDescription =~ m!([^\|]*)\s+!) {
$Description = $1.'...';
}
else {
$Description = 'No description available.';
}
}
$HTML_Text =~ s!(\W|\_)! !g;
$Title =~ s!\s+! !g;
if ($Title =~ m!^ (.+)!) {
$Title = $1;
}
$Description =~ s!\s+! !g;
if ($Description =~ m!^ (.+)!) {
$Description = $1;
}
if (($Rules{'Forbid All Cap Descriptions'}) && ($Description !~ /[a-z]/)) {
$NewDescription = '';
foreach (split(/\s+/,$Description)) {
$NewDescription .= ' '.substr($_,0,1);
$_ = substr($_,1,(length($_)-1));
tr[A-Z][a-z];
$NewDescription .= $_;
}
$Description = $NewDescription;
}
return($Title,$Description,$HTML_Text,$Links);
}
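# AdminVersion: format one spidered page for display - linked title and
# description, then the URL, size and last-modified date.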
sub AdminVersion {
my ($Rank,$URL,$Title,$Description,$Size,$Day,$Month,$Year) = @_;
$Size = ($Size<1500)?int($Size).' bytes':(int($Size/1000)).' Kilobytes';
$wURL = webEncode($URL);
return <<"EOM";
<a href="$wURL"><b>$Title</b></a> $Description
<i><a href="$wURL">$URL</a><small> - size $Size - last updated $Day $Month $Year</small></i><p>
EOM
}
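# webEncode: URL-encode %, + and & and turn spaces into + so the address is
# safe inside an href attribute.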
sub webEncode {
$_ = shift;
return $_ unless m!(\%|\+|\&|\s)!;
s!\%!\%25!g;
s!\+!\%2B!g;
s!\&!\%26!g;
s! !\+!g;
return $_;
}
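# StripIgnoreWords: delete the stop words listed in @IgnoredWords.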
sub StripIgnoreWords {
$_ = shift;
foreach $Ignore (@IgnoredWords) {
s! $Ignore ! !g;
}
return $_;
}
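# error: print the message in $E, close out the HTML page and exit.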
sub error {
print "<center>\n";
print "$E\n\n";
print <<"EOM";
</center>
</body>
</html>
EOM
exit;
}
exit;
-----------------------------------------------
Best Regs
JackofNone