#! /usr/bin/perl # Copyright (c) 2007,8 Joey Schulze # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc.,59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. use strict; use warnings; use Getopt::Long; use LWP::UserAgent; use DBI; use Carp; my $dsn = 'dbi:Pg:dbname=web'; my $dbh; my $config = { mailto => 'joey@infodrom.org', prefix => 'ref_', }; my $verbose = 0; my $quiet = 0; my $site = undef; my $logfile = undef; my %options = ("verbose" => \$verbose, "site=s" => \$site, "logfile=s" => \$logfile, "quiet" => \$quiet, ); =pod CREATE TABLE ref_site ( id SERIAL, name TEXT NOT NULL ); CREATE UNIQUE INDEX ref_site_id ON ref_site (id); CREATE UNIQUE INDEX ref_site_name ON ref_site (name); CREATE TABLE ref_url ( id SERIAL, site INT REFERENCES ref_site (id) ON DELETE CASCADE ON UPDATE CASCADE, path TEXT NOT NULL ); CREATE UNIQUE INDEX ref_url_id ON ref_url (id); CREATE TABLE ref_origin ( id SERIAL, url TEXT NOT NULL, title TEXT DEFAULT '' NOT NULL ); CREATE UNIQUE INDEX ref_origin_id ON ref_origin (id); CREATE TABLE ref_log ( url INT REFERENCES ref_url (id) ON DELETE CASCADE ON UPDATE CASCADE, origin INT REFERENCES ref_origin (id) ON DELETE CASCADE ON UPDATE CASCADE, occurrence TIMESTAMP WITH TIME ZONE NOT NULL ) WITH OIDS; CREATE INDEX ref_log_url ON ref_log (url); CREATE INDEX ref_log_origin ON ref_log (origin); CREATE INDEX ref_log_occurrence ON ref_log (occurrence); DROP TABLE ref_log; DROP TABLE ref_origin; DROP TABLE ref_url; DROP TABLE ref_site; =cut sub fetch_title { my $url = shift; if ($url =~ m{http://www\.google\.}) { my ($base, $rest) = split (/\?/, $url); foreach my $part (sort (split(/\&/, (split (/\?/, $url))[1]))) { if ($part =~ /^q=(.*)$/) { my $search = $1; $search =~ tr/+/ /; utf8::decode $search; return 'Google: ' . $search; } } } my $ua = new LWP::UserAgent; $ua->timeout (20); $ua->env_proxy; my $request = new HTTP::Request ('GET', $url); my $response = $ua->request ($request); if ($response->is_success) { if ($response->content =~ m{(.*)}mi) { my $title = $1; utf8::decode $title; return $title; } } } my %sitehash; sub get_site_id { my $site = shift; return $sitehash{$site} if exists $sitehash{$site}; my $query = sprintf ("SELECT id FROM %ssite WHERE name = '%s'", $config->{prefix}, $site); my $sth = $dbh->prepare ($query); $sth->execute; if ($sth->rows) { my $row = $sth->fetchrow_hashref; $sth->finish; $sitehash{$site} = $row->{id}; return $row->{id}; } $query = sprintf ("INSERT INTO %ssite (name) VALUES (?)", $config->{prefix}); $sth = $dbh->prepare ($query); $sth->execute ($site); $query = sprintf ("SELECT currval('%ssite_id_seq') AS id", $config->{prefix}); $sth = $dbh->prepare ($query); $sth->execute; if ($sth->rows) { my $row = $sth->fetchrow_hashref; $sth->finish; $sitehash{$site} = $row->{id}; return $row->{id}; } } my %urlhash; sub get_url_id { my $site = shift; my $path = shift; return $urlhash{$site}{$path} if exists $urlhash{$site}{$path}; my $query = sprintf ("SELECT id FROM %surl WHERE site = %d AND path = %s", $config->{prefix}, $site, $dbh->quote($path)); my $sth = $dbh->prepare ($query); $sth->execute; if ($sth->rows) { my $row = $sth->fetchrow_hashref; $sth->finish; $urlhash{$site}{$path} = $row->{id}; return $row->{id}; } $query = sprintf ("INSERT INTO %surl (site,path) VALUES (?,?)", $config->{prefix}); $sth = $dbh->prepare ($query); $sth->execute ($site, $path); $query = sprintf ("SELECT currval('%surl_id_seq') AS id", $config->{prefix}); $sth = $dbh->prepare ($query); $sth->execute; if ($sth->rows) { my $row = $sth->fetchrow_hashref; $sth->finish; $urlhash{$site}{$path} = $row->{id}; return $row->{id}; } } my %originhash; sub get_origin_id { my $url = shift; return $originhash{$url} if exists $originhash{$url}; my $query = sprintf ("SELECT id FROM %sorigin WHERE url = %s", $config->{prefix}, $dbh->quote($url)); my $sth = $dbh->prepare ($query); $sth->execute; if ($sth->rows) { my $row = $sth->fetchrow_hashref; $sth->finish; $originhash{$url} = $row->{id}; return $row->{id}; } my $title = fetch_title $url; $query = sprintf ("INSERT INTO %sorigin (url,title) VALUES (?,?)", $config->{prefix}); $sth = $dbh->prepare ($query); $sth->execute ($url, $title); $query = sprintf ("SELECT currval('%sorigin_id_seq') AS id", $config->{prefix}); $sth = $dbh->prepare ($query); $sth->execute; if ($sth->rows) { my $row = $sth->fetchrow_hashref; $sth->finish; $originhash{$url} = $row->{id}; return $row->{id}; } } sub dbwrite { my ($site, $timestamp, $request, $referer) = @_; $dbh = DBI->connect ($dsn, '', '', {'PrintError' => 1}) unless defined $dbh; $dbh->begin_work; my $id_site = get_site_id ($site); return $dbh->rollback unless defined $id_site; my $id_origin = get_origin_id ($referer); return $dbh->rollback unless defined $id_origin; my $id_url = get_url_id ($id_site, $request); return $dbh->rollback unless defined $id_url; my $query = qq{INSERT INTO $config->{prefix}log (url,origin,occurrence) VALUES (?,?,?)}; my $sth = $dbh->prepare ($query); $sth->execute ($id_url, $id_origin, $timestamp); $dbh->commit; } sub parselog { my $sitename = shift; my $logfile = shift; my ($host, $user, $date, $dummy, $type, $request, $code, $size, $referer); my $line; if (open (my $log, $logfile)) { while (<$log>) { chomp; if (($host, $user, $date, $dummy, $type, $request, $code, $size, $referer) = $_ =~ m,^([^\s]+)\s+-\s+([^\[]+)\s+\[([^\]]+)\]\s+"((\S+)\s(.*)\sHTTP/[^"]*)"\s+(\d+)\s+(\d+|-)\s+"([^"]*)",) { next unless $type eq 'GET'; next unless $code eq '200'; next if $referer eq '-' || $referer eq ''; next if $referer =~ m,http://$site/,; if ($referer =~ m{http://www\.google\.}) { my ($base, $rest) = split (/\?/, $referer); next unless defined $rest; $referer = $base.'?'; foreach my $part (sort split(/\&/, $rest)) { $referer .= $part.'&' if ($part =~ /^(q|start|meta)=./); } chop $referer; } $referer =~ s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/seg; utf8::decode $referer; # printf "%s: %s from %s\n", $date, $request, $referer; # printf "%s\n", fetch_title $referer; dbwrite $sitename, $date, $request, $referer; } else { carp "Cannot parse line '$_' in $logfile" unless ($quiet); next; } } close $log; } } GetOptions (%options); die "No site or logfile given" unless defined $site && defined $logfile; my $refs = parselog $site, $logfile;