Gossamer Forum

got a question about link spider, and it doesn't work, need help

Quote Reply
got a question about link spider, and it doesn't work, need help
Hey, I downloaded spider.cgi for Links 2.0 and it tells me to edit the @referers line, so I put
@referers = ('bollywoodblats.hypermart.net','bollywoodblast.hypermart.net')

And when I run the script I get an error saying you're not allowed to run this from this domain.

Then I tried
this: ('','bollywoodblast.hypermart.net')
I still got that error. Then I tried this:
('bollywoodblast.hypermart.net')

It still doesn't work. Does anyone know how to fix this problem? If so, please help me out. Thank you.
Quote Reply
Re: got a question about link spider, and it doesn't work, need help In reply to
OK, this is the error I get when I run spider.cgi. I tried everything in @referers that I described above, but it still doesn't work. Here is the error I get:

Error: Bad Referer. The form attempting to use this script resides at which is not allowed to access this script.


What can I do?

Thank you
Quote Reply
Re: got a question about link spider, and it doesn't work, need help In reply to
Hi:

Curious. The HTTP_REFERER is not being passed. Are you calling this script directly from the browser? Can I see the URL where you have implemented it?
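If you want to see exactly what referer (if any) the server is handing the script, a quick throwaway diagnostic is to print it yourself. This is only a sketch, not part of the script: assuming you drop the line in after the Content-Type header is printed (the first print <<'EOM' block), it shows up as an HTML comment in the page source:

print "<!-- HTTP_REFERER is: '$ENV{'HTTP_REFERER'}' -->\n";

An empty value there means the browser sent no Referer header at all for that request.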

For the time being, comment out (add # at the beginning of each line) the following lines to forgo the referer check; a commented-out version is sketched just after the block:

local($check_referer) = 0;
if ($ENV{'HTTP_REFERER'}) {
foreach $referer (@referers) {
if ($ENV{'HTTP_REFERER'} =~ m|https?://([^/]*)$referer|i) {
$check_referer = 1;
last;
}
}
}
else {
$check_referer = 0;
}
if ($check_referer != 1) {
$E = "Error: Bad Referer. The form attempting to use this script resides at $ENV{'HTTP_REFERER'} which is not allowed to access this script.";
&error;
}
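
To be explicit about what "comment out" means here, this is what that same block would look like afterwards. Every line just gets a leading #, nothing else changes, and the Bad Referer test never runs:

#local($check_referer) = 0;
#if ($ENV{'HTTP_REFERER'}) {
#foreach $referer (@referers) {
#if ($ENV{'HTTP_REFERER'} =~ m|https?://([^/]*)$referer|i) {
#$check_referer = 1;
#last;
#}
#}
#}
#else {
#$check_referer = 0;
#}
#if ($check_referer != 1) {
#$E = "Error: Bad Referer. The form attempting to use this script resides at $ENV{'HTTP_REFERER'} which is not allowed to access this script.";
#&error;
#}

Remember to undo this once the real cause is found, otherwise any site on the web can call the script.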

I intend to rewrite this script from scratch (the current one is a modification of the XAV search script).

Dan :)

Quote Reply
Re: got a question about link spider, and it doesn't work, need help In reply to
Here is the original spider.cgi script, but how come the referers thing doesn't work for me?


#!/usr/local/bin/perl

# Script:
# Virtual Solutions Links Spider

# Copyright:
# Copyright 1999 by Virtual Solutions. Links Spider is a modification (with permission of Fluid Dynamics) of the Fluid
# Dynamics Search Engine Version 2.0 script. The Links Spider modification is freeware and is made available at no
# cost for both personal and commercial use. However, use does not constitute legal rights for resale or
# redistribution without the expressed written permission of both Virtual Solutions and Fluid Dynamics.

# Note:
# For further details including installation instructions please go to http://www.monster-submit.com/mods02.html.
# The original comment lines have been edited out but can be found in the original script at
# ftp://ftp.xav.com/search.txt.

# Fluid Dynamics Copyright Header:
# Fluid Dynamics Search Engine, Version 2.0
# Copyright 1997, 1998 by Fluid Dynamics. Please adhere to the copyright
# notice and conditions of use, described in the attached help file and
# hosted at the URL below. For the latest version and help files, visit:
# http://www.xav.com/scripts/search/

#Edit to point to domains allowed to use this script
@referers = ('www.monster-submit.com','monster-submit.com');

use Socket;

$Rules{'Hits Per Page'} = 10;
$Rules{'Multiplier: URL'} = 4;
$Rules{'Multiplier: Title'} = 10;
$Rules{'Multiplier: Keyword'} = 10;
$Rules{'Multiplier: Description'} = 4;
$Rules{'Max Characters: URL'} = 128;
$Rules{'Max Characters: Title'} = 96;
$Rules{'Max Characters: Description'} = 384;
$Rules{'Max Characters: Auto Description'} = 150;
$Rules{'Max Characters: Keywords'} = 256;
$Rules{'Max Characters: File'} = 64000;
$Rules{'Forbid All Cap Titles'} = 1;
$Rules{'Forbid All Cap Descriptions'} = 1;
$Rules{'Crawler: Minimum WhiteSpace'} = 0.01;
$Rules{'Crawler: Max Pages Per Batch'} = 12;
$Rules{'Crawler: Max Redirects'} = 6;
$Rules{'Crawler: Days Til Refresh'} = 30;
$Rules{'Crawler: User Agent'} = 'Mozilla/4.0 (compatible: FDSE robot)';
$Rules{'Crawler: Follow Query Strings'} = 0;
$Rules{'Crawler: Rogue'} = 0;

@PromoteSites = ();
$Rules{'Promote Value'} = 20;

@IgnoredWords = ('a','about','all','an','and','any','are','as','at',
'be','been','by','can','do','find','for','from','get','have','he',
'how','htm','html','http','i','if','in','is','it','me','most','new',
'no','not','of','on','one','or','other','page','s','site',
'that','the','this','to','two','use','w','web','what','when','where',
'which','who','why','will','with','you','your');

@MonthNames = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');

my %FORM = &ReadInput;
$|=1;

print <<'EOM';
Content-Type: text/html


<head>
<title>Links Spider</title>
<meta name="Robots" content="noindex">
<meta name="Robots" content="nofollow">
</head>



<table border="1" bgcolor="#FFFFFF">
<tr>
<td bgcolor="#C0C0C0">
Attention Webmasters: ALL listings are available for spidering. If you do not wish for your
site to be spidered then it will be necessary to write a
robots.txt file for the
site. Or you can include meta tags in each page between the <head> ... </head> lines:


<meta name="Robots" content="noindex"> To exclude page from being spidered
<meta name="Robots" content="nofollow"> To exclude embedded links from being
spidered

</td>
</tr>
</table>


EOM

local($check_referer) = 0;
if ($ENV{'HTTP_REFERER'}) {
foreach $referer (@referers) {
if ($ENV{'HTTP_REFERER'} =~ m|https?://([^/]*)$referer|i) {
$check_referer = 1;
last;
}
}
}
else {
$check_referer = 0;
}
if ($check_referer != 1) {
$E = "Error: Bad Referer. The form attempting to use this script resides at $ENV{'HTTP_REFERER'} which is not allowed to access this script.";
&error;
}

$|=0;
my @HITS;

if ($FORM{'URL'} ne "") {
if (defined $FORM{'URL'}) {
$FORM{'AddLink0'} = $FORM{'URL'};
}
foreach (keys %FORM) {
next unless (m!^AddLink!);
if ($FORM{$_} =~ m!^http://([^\/]+)/!) {
push(@AddressesToIndex,$FORM{$_});
}
else {
push(@AddressesToIndex,"$FORM{$_}/");
}
}
&AddURL(2,@AddressesToIndex);
}
exit;

sub ReadInput {

my $InputString = '';
my ($Name,$Value);
my %FORM = ('Mode','','Terms','','Password','','SetPassword','','CL',0,'maxhits',0);

$InputString = $ENV{'QUERY_STRING'};

foreach ((split(m!\&!,$InputString)),@ARGV) {
next unless (m!^(.*?)=(.*)$!);
($Name,$Value) = ($1,$2);
$Name =~ s!\%([a-fA-F0-9][a-fA-F0-9])!pack('C',hex($1))!eg;
$Name =~ tr!+! !;
$Value =~ tr!+! !;
$Value =~ s!\%([a-fA-F0-9][a-fA-F0-9])!pack(C,hex($1))!eg;
$FORM{$Name} = $Value;
}
return %FORM;
}

sub GetAbsoluteAddress {

my ($Link,$URL) = @_;

if (($Link =~ m!^\/!) && ($URL =~ m!^http\:\/\/([^\/]+)!i)) {
$Link = "http://$1$Link";
}
elsif (($Link =~ m!^(\w+)\:!) && ($1 !~ m!^http$!i)) {
return '';
}
elsif (($Link !~ m!^http\:\/\/!i) && ($URL =~ m!^(.*)\/!)) {
$Link = $1.'/'.$Link;
}

if ($Link =~ m!^(.*?)\#!) {
$Link = $1;
}
$Link =~ s!^HTTP\:\/\/!http\:\/\/!i;

if ($Link =~ m!^http://([^\/]+)\:80$!) {
$Link = 'http://'.$1.'/';
}
elsif ($Link =~ m!^http://([^\/]+)\:80/(.*)$!) {
$Link = 'http://'.$1.'/'.$2;
}

if ($Link =~ m!^http://([^\/]+)$!) {
$Link .= '/';
}
$Link =~ s!/\./!/!g;

while ($Link =~ m!^([^\?]+)\/([^\/|\.]+)\/\.\.\/(.*)$!) {
$Link = $1.'/'.$3;
}
return $Link;
}

sub OpenSocket {

my ($THEM,$PORT) = @_;

unless (socket(HTTP, PF_INET, SOCK_STREAM, getprotobyname('tcp'))) {
$E = "Error: Low-level socket() function failed with system error \"$!\"";
&error;
}
if ($HashIP{$THEM}) {
$HexIP = $HashIP{$THEM};
}
else {
$HexIP = inet_aton($THEM);
$HashIP{$THEM} = $HexIP;
}

if ((!($HexIP)) || ($HexIP eq 'fail')) {
$HexIP = 'fail';
$E = "Error: Hostname $THEM does not have a DNS entry (no corresponding IP address could be found for this machine). The address may have been mistyped, the site may no longer be online, it's domain may have expired or network errors could have prevented resolution.";
&error;
}
unless (connect(HTTP, sockaddr_in($PORT,$HexIP))) {
$E = "Error: Connect() failed with system error \"$!.\" Typically connect errors involve unreachable or non-functional servers, incorrect port numbers, local DNS problems or a corrupt TCP environment";
&error;
}
select(HTTP);
$|=1;
select(STDOUT);
return 1;
}

sub GetRobotFile {

($THEM,$PORT,$RobotForbidden) = @_;
if ($RobotForbidden) {
$RobotForbidden .= '|';
}
else {
$RobotForbidden = '';
}
$RobotForbidden .= '(';
$RobotForbidden .= quotemeta("$THEM.robot");
$RobotForbidden .= ')';
unless (&OpenSocket($THEM,$PORT)) {
print "<!-- connect to $THEM $PORT for /robots.txt failed - $! -->\n";
return $RobotForbidden;
}

print HTTP "GET /robots.txt HTTP/1.0\r\n";
print HTTP "Host: $THEM\r\n";
print HTTP "User-Agent: $Rules{'Crawler: User Agent'}\r\n";
if (keys %SessionCookie) {
print HTTP "Cookie: ";
my $buffer = '';
foreach (keys %SessionCookie) {
print HTTP "$buffer$_=$SessionCookie{$_}";
$buffer = '; ';
}
print HTTP "\r\n";
}
print HTTP "\r\n";

$AgentToggle = 0;
while (<HTTP> ) {
if (m!user-agent: ([^\r]+)\r?$!i) {
$ForbiddenAgent = $1;
$ForbiddenAgent =~ s!\r!!g;
$ForbiddenAgent =~ s!\n!!g;
$ForbiddenAgent = quotemeta($ForbiddenAgent);
if (($Rules{'Crawler: User Agent'} =~ m!$ForbiddenAgent!i) || ($ForbiddenAgent eq '\*')) {
$AgentToggle = 1;
}
else {
$AgentToggle = 0;
}
}
next unless (m!disallow:\s+([^\r]+)\r?$!i);
next unless ($AgentToggle == 1);
$BadURI = $1;
$BadURI =~ s!\n!!;
$RobotForbidden .= '|('.$THEM;
$RobotForbidden .= quotemeta($BadURI);
$RobotForbidden .= ')';
}
close(HTTP);
return $RobotForbidden;
}

sub GetStringByURL {

($URL) = @_;
$HTMLText = '';
unless ($URL =~ m!^http://([\w-\.]+):?(\d*)($|/(.*))!) {
$E = "Error: Unrecognized URL format for $URL (protocol not supported?)\n";
&error;
}
$THEM = $1;
$URI = $3?$3:'/';
$URI =~ s/\#.*//;
$PORT = $2 ? $2 : 80;

if ($Rules{'Crawler: Rogue'} != 1) {
unless (($RobotForbidden) && ("$THEM.robot" =~ m!^$RobotForbidden$!i)) {
$RobotForbidden = &GetRobotFile($THEM,$PORT,$RobotForbidden);
}
if ($URL =~ m!($RobotForbidden)!i) {
$E = "Error: This server's robots.txt file forbids access to the document";
&error;
}
}

unless (&OpenSocket($THEM,$PORT)) {
$E = "Error: Cannot connect to $THEM on port $PORT - $E\n";
&error;
}

print HTTP "GET $URI HTTP/1.0\r\n";
print HTTP "Host: $THEM\r\n";
print HTTP "User-Agent: $Rules{'Crawler: User Agent'}\r\n";
if (keys %SessionCookie) {
print HTTP "Cookie: ";
my $buffer = '';
foreach (keys %SessionCookie) {
print HTTP "$buffer$_=$SessionCookie{$_}";
$buffer = '; ';
}
print HTTP "\r\n";
}
print HTTP "\r\n";

$StatusLine = 0;
$Status = '';
$Location = '';

$StatusLine = <HTTP>;

if ($StatusLine =~ m!(301|302)!) {
$Status = 'Moved';
while (<HTTP> ) {
if (m!^Set\-Cookie:\s+([^\=]+)\=([^\;]+)!i) {
$SessionCookie{$1}=$2;
}
last if (m!^\r?$!);
next unless (m!^Location\:\s+([^\r\n\s]+)!i);
$Location = &GetAbsoluteAddress($1,$URL);
}
if ($Location) {
return ($Location, 302);
}
else {
$E = "Error: Received redirect status line - $StatusLine - without a corresponding Location: header";
&error;
}
}
elsif ($StatusLine !~ m!200!) {
$Status = 'Unrecognized Response.';
$E = "Error: System did not return a normal HTTP \"200 OK\" status header. The header returned was \"$StatusLine\"";
&error;
}

while (<HTTP> ) {
if ((m!^Content\-Type\:\s+([^\r]*)\r?$!i) && (!(m!text!i))) {
print "<!-- ContentType match failed with $1 -->\n";
$E = "Error: Invalid content-type header returned ($1)";
&error;
}
if (m!^Set\-Cookie:\s+([^\=]+)\=([^\;]+)!i) {
$SessionCookie{$1}=$2;
}
last if (m!^\r?$!);
}

$ExpectRedirect = 0;
while (<HTTP> ) {
if (m!(HTTP-EQUIV|NAME)\=\"?Refresh!i) {
$ExpectRedirect = 1;
}
if (($ExpectRedirect) && (m!CONTENT=\"?(\d+)!i) && ($1 > 10)) {
$ExpectRedirect = 0;
}
if (($ExpectRedirect) && (m!URL=([^\s|\"|\>]+)!i)) {
$Location = &GetAbsoluteAddress($1,$URL);
return ($Location, 302);
}
$HTMLText .= $_;
}
close(HTTP);

if ($SetSaveLinks == -1) {
$SetSaveLinks = 1;
}

if (($Rules{'Crawler: Rogue'} != 1) && ($HTMLText =~ m!<META\s+(NAME|HTTP-EQUIV)(\s+)?=(\s+)?(\")?ROBOTS(\")?\s+CONTENT(\s+)?\=([^\>]+)!i)) {
$RobotsDirectives = $7;
$NoIndex = ($RobotsDirectives =~ m!(NONE|NOINDEX)!i);
$NoFollow = ($RobotsDirectives =~ m!NOFOLLOW!i);
if ($NoIndex) {
$E = 'Error: Robots meta tag of this document forbids indexing';
&error;
}
if ($NoFollow) {
$SetSaveLinks = -1;
print "<!-- GetStringByURL: Encountered extended robot tag ROBOTS=NOFOLLOW; temporarily turning off \$SetSaveLinks -->\n";
}
}

unless (length($HTMLText) > 24) {
$E = "Error: Less than 24 bytes of HTML text";
&error;
}

$NumSpaces = ($HTMLText =~ s! ! !g);
if (($NumSpaces/length($HTMLText)) < $Rules{'Crawler: Minimum WhiteSpace'}) {
$E = "Error: Suspicious content - only $NumSpaces blank spaces in " . length($HTMLText) . " characters. \n";
$E .= "This is forbidden by the 'WhiteSpace Ratio' set up in the \$Rules{} array";
&error;
}
else {
return ($URL,&RawTranslate($HTMLText));
}
}

sub RawTranslate {

$_ = shift;

tr!\n\r\t! !;
s/\&nbsp\;/ /g;

s/(\&Agrave\;|\&\#192\;)/À/g;
s/(\&Aacute\;|\&\#193\;)/Á/g;
s/(\&Acirc\;|\&\#194\;)/Â/g;
s/(\&Atilde\;|\&\#195\;)/Ã/g;
s/(\&Auml\;|\&\#196\;)/Ä/g;
s/(\&Aring\;|\&\#197\;)/Å/g;
s/(\&AElig\;|\&\#198\;)/Æ/g;
s/(\&Ccedil\;|\&\#199\;)/Ç/g;
s/(\&Egrave\;|\&\#200\;)/È/g;
s/(\&Eacute\;|\&\#201\;)/É/g;
s/(\&Ecirc\;|\&\#202\;)/Ê/g;
s/(\&Euml\;|\&\#203\;)/Ë/g;
s/(\&Igrave\;|\&\#204\;)/Ì/g;
s/(\&Iacute\;|\&\#205\;)/Í/g;
s/(\&Icirc\;|\&\#206\;)/Î/g;
s/(\&Iuml\;|\&\#207\;)/Ï/g;
s/(\&Ograve\;|\&\#210\;)/Ò/g;
s/(\&Oacute\;|\&\#211\;)/Ó/g;
s/(\&Ocirc\;|\&\#212\;)/Ô/g;
s/(\&Otilde\;|\&\#213\;)/Õ/g;
s/(\&Ouml\;|\&\#214\;)/Ö/g;
s/(\&times\;|\&\#215\;)/×/g;
s/(\&Oslash\;|\&\#216\;)/Ø/g;
s/(\&Ugrave\;|\&\#217\;)/Ù/g;
s/(\&Uacute\;|\&\#218\;)/Ú/g;
s/(\&Ucirc\;|\&\#219\;)/Û/g;
s/(\&Uuml\;|\&\#220\;)/Ü/g;
s/(\&Yacute\;|\&\#221\;)/Ý/g;
s/(\&THORN\;|\&\#222\;)/Þ/g;
s/(\&szlig\;|\&\#223\;)/ß/g;
s/(\&agrave\;|\&\#224\;)/à/g;
s/(\&aacute\;|\&\#225\;)/á/g;
s/(\&acirc\;|\&\#226\;)/â/g;
s/(\&atilde\;|\&\#227\;)/ã/g;
s/(\&auml\;|\&\#228\;)/ä/g;
s/(\&aring\;|\&\#229\;)/å/g;
s/(\&aelig\;|\&\#230\;)/æ/g;
s/(\&ccedil\;|\&\#231\;)/ç/g;
s/(\&egrave\;|\&\#232\;)/è/g;
s/(\&eacute\;|\&\#233\;)/é/g;
s/(\&ecirc\;|\&\#234\;)/ê/g;
s/(\&euml\;|\&\#235\;)/ë/g;
s/(\&igrave\;|\&\#236\;)/ì/g;
s/(\&iacute\;|\&\#237\;)/í/g;
s/(\&icirc\;|\&\#238\;)/î/g;
s/(\&iuml\;|\&\#239\;)/ï/g;
s/(\&eth\;|\&\#240\;)/ð/g;
s/(\&ntilde\;|\&\#241\;)/ñ/g;
s/(\&ograve\;|\&\#242\;)/ò/g;
s/(\&oacute\;|\&\#243\;)/ó/g;
s/(\&ocirc\;|\&\#244\;)/ô/g;
s/(\&otilde\;|\&\#245\;)/õ/g;
s/(\&ouml\;|\&\#246\;)/ö/g;
s/(\&divide\;|\&\#247\;)/÷/g;
s/(\&oslash\;|\&\#248\;)/ø/g;
s/(\&ugrave\;|\&\#249\;)/ù/g;
s/(\&uacute\;|\&\#250\;)/ú/g;
s/(\&ucirc\;|\&\#251\;)/û/g;
s/(\&uuml\;|\&\#252\;)/ü/g;
s/(\&yacute\;|\&\#253\;)/ý/g;
s/(\&thorn\;|\&\#254\;)/þ/g;
s/(\&yuml\;|\&\#255\;)/ÿ/g;

return $_;
}

sub CompressStrip {

$_ = shift;
$_ = ' '.$_.' ';
s!\W! !g;
s!\_! !g;
$_ = StripIgnoreWords($_);
s!\s+! !g;
return $_;
}

sub AddURL {

my ($tag,@AddressesToIndex) = (@_);
$|=1;

$SetSaveLinks = 1;
$NumRank = 0;

$NumRedirectsFollowed = 0;
$MaxAddresses = scalar @AddressesToIndex;
ADDRESS: for ($AddressIndex = 0; $AddressIndex < $MaxAddresses; $AddressIndex++) {
if ($Rules{'Crawler: Max Pages Per Batch'} <= $AddressIndex) {
push(@IndexedAddresses,'DONE');
last ADDRESS;
}
$URL = $AddressesToIndex[$AddressIndex];
if ($URL !~ m!^http://!i) {
$NumRank++;
$SpiderResults{$URL} = -1;
push(@IndexedAddresses,$URL);
next ADDRESS;
}
$OldURL = $URL;
($URL,$Text) = &GetStringByURL($URL);
if (($Text eq '302') && ($NumRedirectsFollowed < $Rules{'Crawler: Max Redirects'})) {
$NumRank++;
$SpiderResults{$OldURL} = -1;
if ($URL =~ m!http://([^\/]+)$!) {
$URL .= '/';
}
@AddressesToIndex = ('',@AddressesToIndex);
$AddressesToIndex[$AddressIndex+1] = $URL;
$MaxAddresses++;
$NumRedirectsFollowed++;
push(@IndexedAddresses,$OldURL);
next ADDRESS;
}
elsif ($Text eq '302') {
$NumRank++;
$SpiderResults{$OldURL} = -1;
push(@IndexedAddresses,$OldURL);
next ADDRESS;
}
unless ($Text) {
$NumRank++;
$SpiderResults{$URL} = -1;
push(@IndexedAddresses,$URL);
next ADDRESS;
}
$RecordLine = &MakeRecord($URL,'',$Text);
$SpiderResults{$URL} = $RecordLine;
$ByteSize = length($Text);
($DD,$MM,$YYYY) = unpack('A2A2A4',substr($RecordLine,2,8));
$NumRank++;
$Month = $MonthNames[$MM];
$UserResults{$URL} = &AdminVersion($NumRank, $URL, $Title, $Description, $ByteSize, $DD, $Month, $YYYY);
push(@IndexedAddresses,$URL);
next ADDRESS;
}

ADDRESS: foreach $URL (@IndexedAddresses) {
if ($UserResults{$URL}) {
print $UserResults{$URL};
}
}

&CompileLinks;
$LinkCount = scalar (keys %SaveLinks);

if ($LinkCount) {
$QueryString = "";
$QueryString =~ tr! !+!;

$LinkCount = 1;

$PastTime = time - (86400 * $Rules{'Crawler: Days Til Refresh'});
($UnSearched,$OutDated,$Searched,$Failed,$Checked) = (0,0,0,0,1);
foreach (reverse (sort {$SaveLinks{$b} <=> $SaveLinks{$a} || $a cmp $b} keys %SaveLinks)) {
if ($SaveLinks{$_} == 1) {
if ($UnSearched == 0) {
$UnSearched = 1;
$Checked = 1;
}
}
elsif ($SaveLinks{$_} == 2) {
if ($Failed == 0) {
$Failed = 1;
$Checked = 0;
}
}
elsif ($SaveLinks{$_} <= $PastTime) {
if ($OutDated == 0) {
$OutDated = 1;
$Checked = 1;
}
}
else {
if ($Searched == 0) {
$Checked = 0;
$Searched = 1;
}
}
print "$_\n";
$LinkCount++;
}
}
else {
print <<"EOM";
No embedded links were found during this crawl session
EOM
}
print <<"EOM";



EOM
}

sub CompileLinks {

foreach (@SavedLinks) {
$SaveLinks{$_} = 1;
# push(@Global,"$_");
}
}

sub MakeRecord {

my ($URL, $LastModT, $sText) = @_;
$FBYTES = sprintf('%06.f',length($sText));
($Title,$Description,$sText,$Links) = &Extract_Meta($sText,$URL);
$AlphaData = ' ';
$AlphaData .= "u= $URL ";
$AlphaData .= "t= $Title ";
$AlphaData .= "d= $Description ";
$AlphaData .= 'uM='.CompressStrip($URL);
$AlphaData .= 'h='.CompressStrip($sText);
$AlphaData .= 'l='.CompressStrip($Links);
$LastModT = $LastModT ? $LastModT : time;
($DD,$MM,$YYYY) = (localtime($LastModT))[3..5];
$YYYY += 1900;
$CC = 1;
foreach (@PromoteSites) {
next unless ($URL =~ m!^$_!i);
$CC = $Rules{'Promote Value'};
last;
}
for ($CC,$DD,$MM) {
$_ = sprintf('%02.f',$_);
}
return "$CC$DD$MM$YYYY$FBYTES$AlphaData\n";
}

sub Extract_Meta {

($HTML_Text,$URL) = @_;
($Title, $Description, $Links) = ('','','');

foreach (split(m!\<A !i, $HTML_Text)) {
next unless (m!^([^\>]*)HREF(\s+)?=(\s+)?\"?([^\"\s\>]+)!i);
$ThisLink = $4;
$Links .= ' '.$ThisLink;
next unless ($SetSaveLinks == 1);
next if (($Rules{'Crawler: Follow Query Strings'} == 0) && ($ThisLink =~ m!\?!));
$ThisLink = &GetAbsoluteAddress($ThisLink,$URL);
push(@SavedLinks,$ThisLink) if ($ThisLink);
}

foreach (split(m!\<I?FRAME !i, $HTML_Text)) {
next unless (m!^([^\>]*)SRC(\s+)?=(\s+)?\"?([^\"\s\>]+)!i);
$ThisLink = $4;
$Links .= ' '.$ThisLink;
next unless ($SetSaveLinks == 1);
next if (($Rules{'Crawler: Follow Query Strings'} == 0) && ($ThisLink =~ m!\?!));
$ThisLink = &GetAbsoluteAddress($ThisLink,$URL);
push(@SavedLinks,$ThisLink) if ($ThisLink);
}

$HTML_Text .= ' || ';

if ($HTML_Text =~ m!<TITLE.*?>(.*?)<!i) {
$Title = ' '.$1;
$HTML_Text =~ s!<TITLE.*?>.*?<\/TITLE>! !i;
$HTML_Text .= $Title x $Rules{'Multiplier: Title'};
}
elsif (($FILE) && ($FILE =~ m!([^\/]+)$!)) {
$Title = $1;
}

elsif ($URL =~ m!([^\/]+)$!) {
$Title = $1;
}
elsif ($FILE) {
$Title = $FILE;
}
elsif ($URL) {
$Title = $URL;
}
else {
$Title = 'Document';
}

if (($Rules{'Forbid All Cap Titles'}) && ($Title !~ m![a-z]!)) {
$NewTitle = '';
foreach (split(m!\s+!,$Title)) {
unless (length($_) > 1) {
$NewTitle .= $_.' ';
next
}
$NewTitle .= ' '.substr($_,0,1);
$_ = substr($_,1,(length($_)-1));
tr[A-Z][a-z];
$NewTitle .= $_;
}
$Title = $NewTitle;
}

if ($HTML_Text =~ m!.*?<META([^\>]*?)(NAME|HTTP-EQUIV)="keywords"([^\>]*?)(CONTENT|VALUE)="([^\"]+)"!i) {
$KeyWords = ' '.$5;
$HTML_Text .= $KeyWords x $Rules{'Multiplier: Keyword'};
}
if ($HTML_Text =~ m!.*?<META([^\>]*?)(NAME|HTTP-EQUIV)="description"([^\>]*?)(CONTENT|VALUE)="([^\"]+)"!i) {
$Description = ' '.$5;
$HTML_Text .= $Description x $Rules{'Multiplier: Description'};
}

$HTML_Text =~ s/<[^>]*\s+ALT\s*=\s*"(([^>"])*)"[^>]*>/ $1 /ig;

$NoScript = '';
foreach (split(m!(\<\/SCRIPT>|\<\/STYLE>)!i, $HTML_Text)) {
next unless $_;
if (m!^(.*)(\<SCRIPT|\<STYLE)!i) {
$NoScript .= ' '.$1;
}
else {
$NoScript .= ' '.$_;
}
}
$HTML_Text = $NoScript;

if ($HTML_Text =~ m!(.*)<NOFRAMES>(.*)</NOFRAMES>(.*?)!i) {
if (length($2) < 2000) {
$HTML_Text = $1.' '.$2;
}
}

$HTML_Text =~ s!<([^>]*?)>! !g;
$HTML_Text =~ s!\s+! !g;

unless ($Description) {
$tempDescription = substr($HTML_Text,0,$Rules{'Max Characters: Auto Description'});
if ($tempDescription =~ m!([^\|]*)\s+!) {
$Description = $1.'...';
}
else {
$Description = 'No description available.';
}
}

$HTML_Text =~ s!(\W|\_)! !g;
$Title =~ s!\s+! !g;
if ($Title =~ m!^ (.+)!) {
$Title = $1;
}

$Description =~ s!\s+! !g;
if ($Description =~ m!^ (.+)!) {
$Description = $1;
}

if (($Rules{'Forbid All Cap Descriptions'}) && ($Description !~ /[a-z]/)) {
$NewDescription = '';
foreach (split(/\s+/,$Description)) {
$NewDescription .= ' '.substr($_,0,1);
$_ = substr($_,1,(length($_)-1));
tr[A-Z][a-z];
$NewDescription .= $_;
}
$Description = $NewDescription;
}
return($Title,$Description,$HTML_Text,$Links);
}

sub AdminVersion {

my ($Rank,$URL,$Title,$Description,$Size,$Day,$Month,$Year) = @_;
$Size = ($Size<1500)?int($Size).' bytes':(int($Size/1000)).' Kilobytes';
$wURL = webEncode($URL);
return <<"EOM";
$Title $Description
$URL - size $Size - last updated $Day $Month $Year


EOM
}

sub webEncode {

$_ = shift;
return $_ unless m!(\%|\+|\&|\s)!;
s!\%!\%25!g;
s!\+!\%2B!g;
s!\&!\%26!g;
s! !\+!g;
return $_;
}

sub StripIgnoreWords {

$_ = shift;
foreach $Ignore (@IgnoredWords) {
s! $Ignore ! !g;
}
return $_;
}

sub error {

print "<center>\n";
print "$E\n\n";

print <<"EOM";
</center>



EOM
exit;
}
exit;
Quote Reply
Re: got a question about link spider, and it doesn't work, need help In reply to
I have the same problem as NiCk Desi.

Does anyone know how to fix this?
Quote Reply
Re: got a question about link spider, and it doesn't work, need help In reply to
I believe the problem you both are having is that you are attempting to call the script directly from the browser. You cannot do this, as **no** referer is passed. The script is meant to be called from an HTML page, by clicking on a link. For example,

Click Here to Spider
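
Purely as a sketch (the /cgi-bin/ path and the site being spidered are placeholders, not your actual setup), a link on a page hosted at one of the domains you listed in @referers might look like this:

<a href="http://bollywoodblast.hypermart.net/cgi-bin/spider.cgi?URL=http://example.com/">Click Here to Spider</a>

When that link is clicked, the browser sends the address of the page containing it as the referer, which is what the m|https?://([^/]*)$referer|i test needs in order to match.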

Why are you calling the script directly? I could modify the script to allow it, but that would allow anyone on the web to use your script, which is a potential security flaw.
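
If you did want to allow direct calls anyway, the relaxation could be as small as changing the else branch of the referer check to accept a missing referer. This is only a sketch to show the trade-off, since it really does let anyone run the spider:

else {
# Hypothetical change: accept requests that arrive with no HTTP_REFERER
# at all (direct browser calls) instead of treating them as bad referers.
$check_referer = 1;
}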

Dan :)