use 5.010;
use strict;
use warnings;

use URI;
use XML::LibXML;

# Reads an RSS 1.0 (RDF) feed, prints the title and description of every
# item, then loads the page each item links to and prints every hyperlink
# found on that page, resolved to an absolute URL.

# XML::LibXML returns decoded character strings; encode them explicitly on
# output so non-ASCII text is written as UTF-8 instead of triggering
# "Wide character" warnings or being emitted as Latin-1 bytes.
binmode STDOUT, ':encoding(UTF-8)';

# Global data
my $feed_url   = URI->new('http://www.dlib.org/rss/dlib.rss');
my %namespaces = (
    rss => 'http://purl.org/rss/1.0/',
    rdf => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
);

# XML/HTML parser setup
my $parser = XML::LibXML->new();
$parser->recover_silently(1);

# XPath setup: register the prefixes used by the XPath expressions below
my $xpath = XML::LibXML::XPathContext->new();
for my $prefix (sort keys %namespaces) {
    $xpath->registerNs($prefix, $namespaces{$prefix});
}

# Load feed
my $feed = $parser->parse_file("$feed_url") or die 'Could not load feed';

# Find all items
ITEM:
for my $item ($xpath->findnodes('/rdf:RDF/rss:item', $feed)) {
    my ($title, $description) =
        map { $xpath->findvalue("./$_/text()", $item) }
        qw/rss:title rss:description/;

    # findvalue() yields an empty string (never undef) when nothing matches,
    # so a missing element has to be detected by length, not definedness.
    next ITEM unless length($title) && length($description);

    say '#' x 72;
    say "Title: $title";
    say "Description: $description";

    # Determine URL of page relative to feed.  An empty link would resolve
    # to the feed URL itself, so skip items that carry no usable link.
    my $link_text = $xpath->findvalue('./rss:link/text()', $item);
    next ITEM unless $link_text =~ /\S/;

    my $page_url = URI->new_abs($link_text, $feed_url);
    say "Page: $page_url";

    # Load page
    my $page = $parser->parse_html_file("$page_url")
        or die "Could not load page: $page_url";

    # Find all links on the page; URI->new_abs always returns an object,
    # so every anchor with an href produces one output line.
    for my $anchor ($xpath->findnodes('//a[@href]', $page)) {
        my $link = URI->new_abs($anchor->getAttribute('href'), $page_url);
        say "Link: $link";
    }
}