]*>\s*$@@is); s@\s*
\s+@@si || error ("unparsable entry (title) in $url"); my $title = $1; my $date = $2; $title = " $title"; # kludge to avoid re-posting everything: # stay compatible with the previous version. # delete some left over cruft... s@]*\bHREF=\"[a-z]+\.php[^<>\"]+\"[^<>]*>(.*?)@$1@gsi; my $body = $_; $title =~ s@<[^<>]*>@@g; # lose tags in title my $eurl = "$url\#$anchor"; unshift @sec, ($eurl, $date, $title, $body); } return @sec; } sub do_bondagefiles { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/[\r\n]+/ /gs; s@^.*?(]*\bblacktri\.gif\b.*$@@is || error ("unable to trim tail in $url"); s@(]*\bHREF\b)@\n\001\001\001\n$1@gi; my @sec1 = split (/\n\001\001\001\n/s); my @sec2 = (); foreach (@sec1) { next if (m/^\s*$/s); s@^\s*]*?\bHREF=\"([^<>\"]+)\"[^<>]*>\s*(.*?)\s*\s*@@is || error ("unparsable entry (url) in $url"); my $eurl = $1; my $title = $2; my $date = ''; my $body = $_; $body =~ s@<[^<>]*>@@g; # lose tags in body push @sec2, ($eurl, $date, $title, $body); } return @sec2; } # generate a multi-entry RSS file with links to each picture, # but without inline images. # sub do_apod_link { my ($url, $html) = @_; $url =~ s@/[^/]*$@/@; # take off last path component $_ = $html; 1 while (s@@ @gsi); # lose comments s@^.*?\s*@@is || error ("unable to trim head in $url"); s@\s*
\s*@@si || error ("unable to trim head in $url"); s@?(p|br)>\s*(<[^<>]+>\s*)*Tomorrow\'s picture:.*?$@@ || error ("unable to trim tail in $url"); s@^(\s*<[^<>]+>\s*)+@@s; # lose leading tags s@^\s*((\d{4})[- ]([a-z]+)[- ](\d\d?))\b\s*(<(BR|P)>\s*)*@@i || error ("$url: unable to find date"); my $date = $1; my $year = $2; my $month = $3; my $dotm = $4; s@?(TR|TD|TABLE)\b[^<>]*>\s*@@gsi; # lose table tags my $body = $_; my %m = ( "Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12); $month =~ s/^(...).*/$1/; $month = $m{$month} || error ("unparsable month: $month"); my $eurl = $url . sprintf ("ap%02d%02d%02d.html", $year % 100, $month, $dotm); s@<(P|BR)\b[^<>]*>@\n@gsi; # expand newlines s@<[^<>]*>@@g; # lose other tags my ($title) = m@^\s*(.*?)\s*$@m; return ($eurl, $date, $title, $body); } sub do_redmeat { my ($url, $html) = @_; $url =~ s@/[^/]*$@/@; # take off last path component $_ = $html; 1 while (s@@ @gsi); # lose comments s@[\r\n]+@ @gs; s@^.*?(
@gsi); # compress P, BR # lose now-redundant A tags s@]*>\s*]*>@@gsi; # lose heading s@^\s*Merriam-Webster's Word of the Day\s*(
]*>\s*)?@@si; # compress spaces s@\s+@ @gs; my $body = "$_"; s@<[^<>]*>@@g; # lose tags m@the\s+word\s+(?:of|for)?\s+the\s+day\s+for\s+([a-z]+\s+\d+) (?:\s+is)?\s*:\s*([^\s]+)\b@ix || error ("unparsable entry in $url"); my $date = $1; my $title = $2; $title = "Word of the Day: $title"; return ($url, $date, $title, $body); } sub do_space_com { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments 1 while (s@]*>@@gsi); # lose javascript s@[\r\n]+@ @gs; s@(
]*> (.*?) | ]*> \s*
]*>\s*$@@gsi);
$_ = $body;
my $first_line;
m@^(.*?)\s*
\s*(.*)$@ || error ("no first line in entry in $url?");
$first_line = $1;
$body = $2;
$_ = $first_line;
s@?[BI]>@@gsi;
my ($posted_by, $date) =
m@^(.*? Posted\sBy .*? ) \s* \bon\b ( .* )$@xsi;
error ("unable to parse posted-by in $url") unless defined ($date);
$body = "$posted_by $body";
$_ = $body;
$title =~ s@<[^<>]*>@@g;
push @sec, ($eurl, $date, $title, $body);
}
return @sec;
}
sub do_dnasty {
my ($url, $html) = @_;
# a Blogger variant
$_ = $html;
1 while (s@@ @gsi); # lose comments
s@^.*?
]*\bID=\"bloggerBug\"[^<>]*>.*$@@is || error ("unable to trim tail in $url"); # lose the dates: there is not a date per entry. s@
]*>\s*$@@is); s@\s*
(.*?)
\s*@@i || error ("unparsable date in $url"); my $date = $1; s@\s*\b(CLASS|TARGET)=\"?[^\"<>]*\"?@@gsi; my $body = $_; $_ = $date; my ($anchor) = m@NAME=\"([^\"]+)\"@i; $anchor = undef; if (! $anchor) { # Oh, you morons. #error ("no anchor in $url?"); tr/A-Z/a-z/; s/[^a-z\d]/ /gi; s/\s+/ /gsi; s/^\s//gsi; s/\s$//gsi; s/^[^\s]+ ([^\s]+) ([^\s]+) .*$/$1$2/si; s/\s/_/gsi; $anchor = $_; } my $eurl = "$url\#$anchor"; $date =~ s/<[^<>]*>//gsi; $date =~ s/\#//gsi; $date =~ s/^\s+//gsi; $date =~ s/\s+$//gsi; $date =~ s/\s+/ /gsi; $body =~ s/^\s+//gsi; $body =~ s/\s+$//gsi; $body =~ s/\s+/ /gsi; # try for a title on the first line my $title = $body; $title =~ s@^(\s*?(P|BR)[^<>]*>)+\s*@@si; $title =~ s@?(P|BR)[^<>]*>.*$@@si; $title =~ s@.*$@@si; $title =~ s@<[^<>]*>@ @gs; $title =~ s@^(.{20}.*?([:;.?!]+|\s-+)).*@$1@si; $title =~ s@^\s+@@; $title =~ s@\s+$@@; # lose trailing P and /DIV 1 while ($body =~ s@\s*]*>\s*$@@is); $body =~ s@\s*
]*\b(CLASS|ID)=\"headmed\"[^<>]*>\s*@@is || error ("unable to trim head in $url"); s@\s*]*>.*?$@@is || error ("unable to trim tail in $url"); s@\s+@ @gsi; s@(\s*
]*\bCLASS=\"dateline\">\s*)@\n$1@gsi; my @sec1 = split (/\n/); my @sec2 = (); shift @sec1; foreach (@sec1) { next if (m/^\s*$/s); s@
(.*?)\s*(]*?\bHREF=\"([^<>\"]+)\"[^<>]*>\s*(.*?)\s*@@is || error ("unparsable entry (anchor) in $url"); my $eurl = $1; my $title = $2; my $body = $_; # lose leading/trailing P, BR, and DIV 1 while ($body =~ s@^\s*?(P|BR|DIV)\b[^<>]*>\s*@@is); 1 while ($body =~ s@\s*?(P|BR|DIV)\b[^<>]*>\s*$@@is); $date =~ s/<[^<>]*>/ /gsi; $date =~ s/\s+/ /gs; # his urls have ctimes in them - use that instead if ($eurl =~ m/\.(\d+)$/) { $date = strftime ("%a %b %e %H:%M:%S %Y %Z", localtime($1)); } push @sec2, ($eurl, $date, $title, $body); } return @sec2; } sub do_linkfilter { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@ @ @gs; s@\s+@ @gsi; s@\s*(
s@(?P\b[^<>]*>\s*)+@
@gsi;
my ($date) = m@submitted(.*?)
@si;
s@^Link\b(.*?)
\s*@@gsi || error ("no date line in $url");
$date =~ s@^.* on @@;
$date =~ s@\s*\.?\s*\(.*$@@;
s@\s+\bONCLICK=\"[^\"]+\"@@gsi;
# you chumps. undo link-tracking BS.
s@]*>\s*(http:[^<>\"]+)\s*\s*@@si;
my $turl = $1;
my $oturl = $turl;
error ("no url found in $url") unless defined ($turl);
# FUCK! assholes! we have to use their redirector if the link was
# long, because they truncate it.
if ($turl =~ m/\.\.\.$/) {
$turl = "$eurl;cmd=go";
}
# lose trailing Comments links
s@]*>\s*Comments\b.*$@@si;
# lose trailing P, BR, and DIV
1 while (s@\s*?(P|BR|DIV)\b[^<>]*>\s*$@@is);
my $body = $_;
$body = "$oturl
$body"; push @sec2, ($eurl, $date, $title, $body); } return @sec2; } sub do_creaturesinmyhead { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s@ @ @gs; s@\s+@ @gsi; my ($eurl, $img, $title) = m@\"]+)\"[^<>]*> \s* (]*>) \s* (.*?)
" .
"$img
$title
]*\bCLASS=\"(topheadline|secondheadline)\"[^<>]*>\s*@\n@gsi;
s@(<(A)\b[^<>]*\bCLASS=\"(linklink)\"[^<>]*>\s*) @ \n $1 @gsix;
my @body_sec = ();
my @link_sec = ();
my @sec1 = split (/\n/);
shift @sec1;
foreach (@sec1) {
s@?(TD|TR|IMG|FONT|DIV|SPAN)\b[^<>]*>@ @gsi;
next if (m/^\s*$/);
my $headp = (m/CLASS=\"[^\"<>]*?headline\"/i ? 1 : 0);
s@\s*\b(CLASS|TARGET)=\"[^\"<>]*\"@@gsi;
s@]*>\s*\s*@@gsi;
s@\s+@ @gsi;
s@^\s* ]*>\s*@@gsi; s@^\s*]*?\bHREF=\"([^<>\"]+)\"[^<>]*>\s*(.*?)\s*@@is || error ("unparsable entry (anchor) in $url"); my $eurl = $1; my $title = $2; my ($date) = m@\s(\d\d?\.\d\d\.\d\d)\s*@si; error ("no date line in $url") if ($headp && !defined ($date)); $date = '' unless defined($date); # lose byline s@^\s*--.*?\d\d?\.\d\d\.\d\d\s*@@gsi; # lose trailing P, BR, and DIV 1 while (s@\s*?(P|BR|DIV)\b[^<>]*>\s*$@@is); my $body = $_; if ($headp) { $title = capitalize ($title); } else { $_ = $title; m@^\s*(.*?)\s*(.*)$@si || error ("unable to extract title in $url"); $title = $1; $body = $2; } $title =~ s@<[^<>]*>@ @sg; $title =~ s@\s+@ @sg; $body =~ s@^\s* \s*@@si;
$body = "$title $body";
$title =~ s@[\s.,:;]+$@@sg;
if ($headp) {
push @body_sec, ($eurl, $date, $title, $body)
if ($#body_sec+1 < $max_links_each * 4);
} else {
push @link_sec, ($eurl, $date, $title, $body)
if ($#link_sec+1 < $max_links_each * 4);
}
}
return (@body_sec, @link_sec);
}
sub do_artbomb {
my ($url, $html) = @_;
# a Blogger variant
$_ = $html;
1 while (s@@ @gsi); # lose comments
# lose the dates: there is not a date per entry.
s@]*>.*?\s*@@gsi;
s@ @ @gsi;
s@\s+@ @gsi;
s@(\s*( \s*)? @gsi); s@?(TD|TR|FONT|DIV|SPAN)\b[^<>]*>@ @gsi; s@\s*\b(CLASS|TARGET)=\"?[^\"<>]*\"?@@gsi; # lose trailing P, DIV, HR 1 while (s@\s*?(P|DIV|HR)\b[^<>]*>\s*$@@is); my $date = ''; # This blog is fucked up, because it has only one anchor for multiple # entries per day! WTF, man. So let's split it and manufacture # different URLs. # my $count = 0; my @sec3 = split (/ \s*/i); foreach (@sec3) { my $body = $_; # try for a title on the first line my ($title) = m@\s*(.*?)\s*@si; ($title) = m@]*>\s*(.*?)\s*@si unless defined ($title); $title = '' unless defined ($title); $title =~ s@^(.{20}.*?([:;.?!]+|\s-+)).*@$1@si; $title =~ s@^\s+@@; $title =~ s@[\s:;.,]*$@@; my $ii = $#sec3 - $count; my $eurl = $url . ($ii == 0 ? "" : "?$ii") . "#" . $anchor; push @sec2, ($eurl, $date, $title, $body); $count++; } } return @sec2; } sub do_leonard { my ($url, $html) = @_; $_ = $html; 1 while (s@@ @gsi); # lose comments s/[\r\n]+/\n/gs; #s@(]*?\bCLASS=\"title\"[^<>]*>\s*(.*?)\s*\s*@is # Date becomes part of the title if ( m@ ]*?\bCLASS=\"date\"[^<>]*>\s*(.*?)\s* @is )
{
if ( $inPost )
{
$title = "$date: $time";
#$title =~ s@<[^<>]+>@@gs; # lose tags in title
$body =~ s@@@gs; # lose anchors
#$body =~ s@<[^<>p]+>@ @gs; # lose tags in body
push @sec, ($eurl, $date, $title, $body);
}
$inPost = 0;
$date = $1;
}
# Grab the time of the post
elsif ( m@]*?\bCLASS=\"time\"[^<>]*>\s*(.*?)\s* @is )
{
if ( $inPost )
{
$title = "$date: $time";
#$title =~ s@<[^<>]+>@@gs; # lose tags in title
$body =~ s@@@gs; # lose anchors
#$body =~ s@<[^<>p]+>@ @gs; # lose tags in body
push @sec, ($eurl, $date, $title, $body);
}
$time = $1;
$eurl = "";
$title = "";
$body = "";
$inPost = 1;
}
# Grab permalink
elsif ( m@]*?\bHREF=\"([^\"]+journal[^\"]+)\">@is )
{
$eurl = $1;
}
# Stuff the content in the body
elsif ( $inPost )
{
$body .= $_;
}
}
return @sec;
}
sub do_radiocanada {
my ($url, $html) = @_;
$_ = $html;
s@^\s*<\?xml[^<>]*>\s*@@is || error ("unable to trim head in $url");
s@^\s*Cheesegrater RSS FeedsCheesegrater is a "screen scraper" which creates RSS feeds for sites that don't have one. The software has been modified by Evan Jones, and the source code is available. This is a list of the RSS feeds available, feel free to subscribe to them or to request new ones. Be warned that this script will not hit a site more frequently than once every six hours, and sometimes even less than that. Overall: requests: $overallStats[0] fetches: $overallStats[1] 200s: $overallStats[2] 304s: $overallStats[3] gzips: $overallStats[4] EOF foreach my $url ( keys %filter_table ) { my ($fn, $title, $desc, $rss_img, $rss_img_w, $rss_img_h, $expirey) = get_filter_data ($url); $title =~ s/&/&/g; # de-HTMLify $url =~ s/&/&/g; # de-HTMLify my @currentStats = ( 0, 0, 0, 0, 0 ); @currentStats = @{$stats{$url}} if exists $stats{$url}; print qq{requests: $currentStats[0] fetches: $currentStats[1] 200s: $currentStats[2] 304s: $currentStats[3] gzips: $currentStats[4] |