#!/usr/bin/perl

# Copyright (C) 2006 David M. Turner
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Boston, MA 02110 USA

use strict;
use warnings;

# Precompiled patterns that the main loop matches, in order, against the
# slurped page text.  All of them are compiled with /s and /m and without
# /x, so every character inside the braces (including spaces and line
# breaks) is matched literally.

# Marks the beginning of a post.
# NOTE(review): this pattern is empty and therefore has no capture group,
# yet the main loop assigns $1 to $post_id right after matching it.  The
# marker text this pattern was meant to contain may be missing from this
# copy of the script -- confirm against a known-good copy.
my $start_of_post = qr{}sm;

# Literal text the main loop looks for before it looks for the date.
my $start_of_date = qr{Old}sm;

# Captures a timestamp of the form "dd-dd-dddd, dd:dd AM" (or PM) in $1.
my $date = qr{(\d\d-\d\d-\d{4}, \d\d:\d\d (?:AM|PM))}sm;

# Used to pick out the poster.
# NOTE(review): only one capture group is present, but the main loop reads
# both $1 (user id) and $2 (user name) after matching it -- confirm the
# intended pattern.
my $person = qr{(.*?)}sm;

#this one is only on the first post, looks like
# (the pattern continues on the next line; the line break is part of it)
my $title = qr{\W*
\W*(.*?)}sm;

# degooglize($str)
#
# Removes highlights added by Google cache.
#
# Takes one string, applies a global substitution to a private copy of it,
# and returns that copy; the caller's variable is not modified.
#
# NOTE(review): as written, the substitution replaces every run of
# non-"<" characters with itself, so the returned string equals the input.
# The highlight markup the pattern was presumably meant to match may be
# missing from this copy of the script -- confirm against a known-good copy.
sub degooglize {
    my ($str) = @_;

    $str =~ s{([^<]+)}{$1}g;

    return $str;
}

# Pattern for the body of a post.  It continues on the following lines;
# the line breaks are part of the pattern because /x is not in effect.
my $body = qr{
(.*?)
(?:\W+)}sm; my $page = join ("", <>); open my $output, ">>output.txt"; my $post_title = undef; my $post_id = -1; my $post_user_name; my $post_user_id; my $post_body; my $post_date; my $post_count = 0; while (1) { if ($page !~ /$start_of_post/g) { print "Recovered $post_count posts.\n"; last; } $post_id = $1; $post_count += 1; if ($page !~ /$start_of_date/g) { print "PARSER ERROR: couldn't get date start in post $post_id\n"; } if ($page !~ /$date/g) { print "PARSER ERROR: couldn't get date itself in post $post_id\n"; } $post_date = $1; if ($page !~ /$person/g) { print "PARSER ERROR: couldn't get poster in post $post_id\n"; } $post_user_id = $1; $post_user_name = $2; #it's possible that the poster has an avatar. If so, get the name from it. if ($post_user_name =~ /^