OK, I have put up with HTML::LinkExtor for long enough. The current code just seems to be too temperamental:
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
# Fetch $url, collect the link attributes of every <a> tag on the page, and
# print them as absolute URLs separated by <BR>.
# NOTE(review): the closing brace at the end of this snippet has no matching
# opener here -- presumably a "sub Test {" line was lost; confirm.
my $ua = new LWP::UserAgent;
my $url = 'http://www.test.com';
# Set up a callback that collects hyperlink (<a>) URLs into @urls
my @urls = ();
my ($p, $res);
# Called with a tag name followed by attribute name/value pairs.
sub callback2 {
my($tag, %attr) = @_;
return if $tag ne 'a'; # we only look closer at <a ...>
# Every attribute value passed for the tag is kept, whatever its name.
push(@urls, values %attr);
}
# Make the parser. Unfortunately, we don't know the base yet
# (it might be different from $url)
$p = HTML::LinkExtor->new(\&callback2);
# Request document and parse it as it arrives
# NOTE(review): request() presumably always returns a response object, in
# which case the "|| die" can never fire -- confirm against the LWP docs.
$res = $ua->request(HTTP::Request->new(GET => $url),
sub {$p->parse($_[0])}) || die $!;
# Expand all collected URLs to absolute ones, relative to the response's base
my $base = $res->base;
@urls = map { $_ = url($_, $base)->abs; } @urls;
# $IN is not declared anywhere in this snippet; it must come from the
# surrounding program.
print $IN->header();
print join("<BR>",@urls);
}
Does anyone have a reasonably reliable method to grab URLs from a page?
TIA
Andy (mod)
andy@ultranerds.co.uk
IMPORTANT: I've now moved to ultranerds.co.uk, and the .com will no longer work!
Want to give me something back for my help? Please see my Amazon Wish List
GLinks ULTRA Package (plugins total "value" $3,325 & rising, for just $350)| GLinks ULTRA Package PRO (plugins total "value" $5,625 & rising, for just $500)
Support Forum | Links SQL Plugins | DMOZ Dumps | UltraNerds | ULTRAGLobals Plugin | Pre-Made Template Sets | FREE GLinks Plugins!
Compare our different Plugin packages *new* Free CSS Templates
Code:
# Test - fetch a page and print every hyperlink (<a ...>) it contains as an
# absolute URL, with the URLs separated by "<BR>".
#
# Takes no arguments; the page fetched is the hard-coded $url below.
# Output goes to the currently selected filehandle, preceded by whatever
# $IN->header() returns. $IN is not declared in this sub and must be
# supplied by the surrounding program.
#
# Dies with the HTTP status line if the page cannot be fetched.
sub Test {
    use LWP::UserAgent;
    use HTML::LinkExtor;
    use URI::URL;

    my $ua  = LWP::UserAgent->new;
    my $url = 'http://www.test.com';

    my @urls;

    # The callback must be an anonymous sub. A *named* sub nested inside
    # Test binds only to the @urls of the first call ("Variable will not
    # stay shared"), so in a persistent interpreter every later call would
    # collect links into a stale array and print nothing.
    my $collect_link = sub {
        my ($tag, %attr) = @_;
        return if $tag ne 'a';    # we only look closer at <a ...>
        push @urls, values %attr;
        return;
    };

    # Make the parser. We don't know the base yet (it might be different
    # from $url, e.g. after a redirect), so links are made absolute later.
    my $p = HTML::LinkExtor->new($collect_link);

    # Request the document and parse it as it arrives.
    my $res = $ua->request(
        HTTP::Request->new(GET => $url),
        sub { $p->parse($_[0]) },
    );

    # request() always returns a response object, so failure has to be
    # detected on the response itself rather than by its truthiness.
    die "Cannot fetch $url: " . $res->status_line
        unless $res->is_success;

    # Signal end of document so any buffered trailing input is processed.
    $p->eof;

    # Expand all collected URLs to absolute ones.
    my $base = $res->base;
    @urls = map { url($_, $base)->abs } @urls;

    print $IN->header();
    print join("<BR>", @urls);
}
Does anyone have a reasonably reliable method to grab URLs from a page?
TIA
Andy (mod)
andy@ultranerds.co.uk
IMPORTANT: I've now moved to ultranerds.co.uk, and the .com will no longer work!
Want to give me something back for my help? Please see my Amazon Wish List
GLinks ULTRA Package (plugins total "value" $3,325 & rising, for just $350)| GLinks ULTRA Package PRO (plugins total "value" $5,625 & rising, for just $500)
Support Forum | Links SQL Plugins | DMOZ Dumps | UltraNerds | ULTRAGLobals Plugin | Pre-Made Template Sets | FREE GLinks Plugins!
Compare our different Plugin packages *new* Free CSS Templates

