#! /usr/bin/perl -w
# NAME
# fetch.pl - RSS/Atom tiny aggregator
#
# SYNOPSIS
#
# fetch.pl [config.conf]
#
# DESCRIPTION
#
# Fetch RSS/Atom feeds read from config file, and generate HTML.
# If no argument is supplied, config is read from "fetch.conf".
# HTML file name is filename minus its extension: so config file
# "fetch.conf" generates "fetch.html".
#
# Change '$base' to set another HTML directory, and '$max_entries'
# to set the maximum number of entries shown per feed.
#
# Mandatory perl modules are:
# - Encode
# - LWP::Simple
# - XML::RSS::Feed
# - XML::Atom
#
# AUTHOR
# oz (@tuxaco dot net)
# http://oz.tuxaco.net/
use strict;
use warnings;
use Encode;
use LWP::Simple qw ( $ua );
use XML::RSS::Feed;
use XML::Atom::Entry;
# Base conf settings you may change
my $base = '/home/oz/whatever';
my $agent = 'Mozilla 5.0';
my $conf = shift || 'fetch.conf' || die("I need a conf. file.\n");
my $max_entries = 12;
my $encoding = 'utf8';
# -----------------------------------------------------------------------------
my $name = $conf;
$name =~ s/^(.*\/)?(.*)\.conf$/$2/g;
my $files = read_conf($conf);
my $output = "$base/$name.html";
my $now = scalar localtime();
my $out = '';
defined $files or die("No feeds ? $!");
# Begin HMTL file
add(<<"HTML");
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<html>
<head>
<title>My tiny aggregator</title>
<link rel="stylesheet" href="style.css" media="screen" name="default" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body>
<div id="itemlist">
<div id="lastupdate">
Reading: $name<br />
Last update: $now
</div>
HTML
status("Start parsing: \n");
$ua->agent($agent);
# Each feed in config file is fetched and parsed
foreach my $feed ( @$files ) {
my $item_count = 0;
my $file = $feed->{'url'};
my $data = undef;
$file =~ s|.*/||;
status(" * $feed->{'name'}... ");
# Get feed
my $query = $ua->get($feed->{'url'});
if ($query->is_success) {
$data = $query->content;
} else {
status("Failed (HTTP error).\n");
next;
}
# Parse feed
my $channel = ($data =~ /<(rss|rdf)/imo) ? rss($data) : atom($data);
unless ($channel) {
status("Failed parsing.\n");
next;
};
# Print feed
my $desc = encode( $encoding, $channel->{'desc'} );
my $title = encode( $encoding, $channel->{'title'} );
add(<<"HTML");
<div class="item">
HTML
add(qq| <h1><a href="$channel->{'link'}" title="$feed->{'name'}">|);
add(qq|$title</a></h1>\n|);
add(qq| <p>$desc</p>\n|);
add(" <ul>\n");
# Print feed entries
foreach my $item ( @{ $channel->{'entries'} } )
{
my $title = encode(
$encoding,
defined ($item->{'title'}) ? $item->{'title'} : $feed->{'name'}
);
add(qq| <li><a href="$item->{'link'}">$title</a></li>\n|);
$item_count++;
last if ($item_count >= $max_entries);
}
add(<<HTML);
</ul>
</div>
HTML
status("Ok.\n");
}
# End HTML
add(<<HTML);
</div>
</body>
</html>
HTML
umask(0022) or die("Can't change file creation mask: $!");
open (OUT, "> $output") or die("Output file ($output): $!");
print OUT $out;
close OUT;
status("\nAll done, bye.\n");
# -----------------------------------------------------------------------------
# Returns entry list for an RSS feed (in $data);
sub rss
{
my ($data) = @_;
# ugly hack 'cause lib-expat often misses the 15 encoding while
# the first version remains compatible in 99.99% cases.
if ( $data =~ /encoding="iso-8859-15"\?>/io ) {
$data =~ s/encoding="iso-8859-15"/encoding="iso-8859-1"/i;
}
my $rss = XML::RSS->new();
status(" [R] ");
eval {
$rss->parse($data);
} or return undef;
return { 'link' => $rss->{'channel'}->{'link'},
'title' => $rss->{'channel'}->{'title'},
'desc' => $rss->{'channel'}->{'description'},
'entries' => $rss->{'items'},
};
}
# Returns entry list for an Atom feed (in $data);
sub atom
{
my ($data) = @_;
my $feed;
my @my_entries;
status(" [A] ");
eval {
$feed = XML::Atom::Feed->new(\$data);
} or return undef;
return undef unless $feed;
map { push @my_entries, {
'title' => $_->title ? $_->title : 'No title ?',
'link' => atom_html_link($_),
}
} $feed->entries;
return {
'title' => $feed->title ? $feed->title : 'No title ?',
'desc' => $feed->tagline ? $feed->tagline : 'No description',
'link' => atom_html_link($feed),
'entries' => \@my_entries,
};
}
# Get first text/html link from an XML::Atom::Entry link list.
sub atom_html_link
{
my ($link) = @_;
my @linklist = $link->link;
foreach my $link (@linklist) {
next if ($link->type ne 'text/html'
or $link->rel ne 'alternate');
return $link->href;
}
return '';
}
# Read config.
sub read_conf
{
my ($file) = @_;
my @items = ();
open (CONF, $file) or die("Can't open '$file': $!.\n");
while (my $line = <CONF>) {
chomp $line;
next if $line =~ /^\s*#|^\s*$/;
my $feed;
if ( $line =~ /^(.*?)\s*=\s*(.*)$/ ) {
$feed->{'name'} = $1;
$feed->{'url'} = $2;
}
push @items, $feed;
}
close CONF;
return \@items;
}
# Append args to $out
sub add
{
$out .= shift;
}
# Print status message on standard error
sub status
{
my ($str) = @_;
return unless $str;
print STDERR $str;
}