#!/usr/bin/perl
#
# HT://DIGALIZER
#
# Author:
#   Nathan Hand
#
# History:
#   2001 June 4th - First Release, No Version Number
#
# Usage:
#   cat /var/log/htsearch.log | htdigalizer | webalizer -
#
# Description:
#   Converts htsearch logs (enabled in htdig.conf with logging:yes) into
#   Extended Common Log Format. Loses lots of information in the process
#   and CLF isn't an appropriate log format. Useful because there are no
#   htsearch specific graphing utilities, so with this script you'll get
#   something up quickly using webalizer.
#
# Bugs:
#   The labels generated by webalizer will be misleading.
#   The graphs and tables are mostly inappropriate.
#
# Future:
#   This program has no future! It's a stopgap until someone (preferably
#   not me) writes a proper htsearch specific graphing utility. 
#

use Time::Local;
use POSIX qw(strftime);

# parse_syslog($logline)
#
# Splits one syslog-style line of the form
#
#     Mon DD HH:MM:SS host process: message
#
# into its parts.
#
# Returns the list ($time, $host, $process, $htlog), where $time is the
# epoch time of the entry interpreted in the local timezone and $htlog is
# the message text after "process: ". Returns an empty list when the line
# does not match the expected layout, when the month abbreviation is not
# recognised, or when the date/time fields are out of range.
sub parse_syslog {
    my ($logline) = @_;

    return unless defined $logline;

    # timelocal() wants a zero-based month number, so map the three-letter
    # abbreviations to 0..11.
    my %month_index = (
        Jan => 0, Feb => 1, Mar => 2,  Apr => 3,
        May => 4, Jun => 5, Jul => 6,  Aug => 7,
        Sep => 8, Oct => 9, Nov => 10, Dec => 11,
    );

    if ($logline =~ /(\S+)\s+(\d+)\s+(\d\d):(\d\d):(\d\d)\s+(\S+)\s+([^:]+):\s(.*)/) {
        my ($month_name, $day, $hour, $min, $sec, $host, $process, $htlog)
            = ($1, $2, $3, $4, $5, $6, $7, $8);

        my $month = $month_index{$month_name};
        return unless defined $month;

        # The timestamp carries no year, so start from the current one.
        my $now  = time;
        my $year = (localtime $now)[5] + 1900;

        # timelocal() croaks on out-of-range fields (e.g. "Feb 30"); trap
        # that so one corrupt line cannot abort the whole run.
        local $@;
        my $time = eval { timelocal($sec, $min, $hour, $day, $month, $year) };

        # An entry dated more than a day ahead of the clock was almost
        # certainly written last year (e.g. December entries processed in
        # January), so retry with the previous year. The same retry covers
        # "Feb 29" when the current year is not a leap year.
        if (!defined $time || $time > $now + 86400) {
            my $earlier = eval { timelocal($sec, $min, $hour, $day, $month, $year - 1) };
            $time = $earlier if defined $earlier;
        }
        return unless defined $time;

        return ($time, $host, $process, $htlog);
    }

    return;
}

# parse_htlog($logline)
#
# Splits the message part of an htsearch log entry, laid out as
#
#     remote [config] (method) [words] [logic] (matches/perpage) - curpage -- referrer
#
# into its fields.
#
# Returns the list ($remote, $config, $method, $words, $logic, $matches,
# $perpage, $curpage, $referrer), or an empty list when the text is
# undefined or does not match that layout.
sub parse_htlog {
    my ($logline) = @_;

    return unless defined $logline;

    if ($logline =~ /(\S+) \[([^\]]*)\] \(([^\)]*)\) \[([^\]]*)\] \[([^\]]*)\] \((\d+)\/(\d+)\) - (\d+) -- (.*)/) {
        # Copy the captures into lexicals straight away so nothing leaks
        # into package globals.
        my ($remote, $config, $method, $words, $logic,
            $matches, $perpage, $curpage, $referrer)
            = ($1, $2, $3, $4, $5, $6, $7, $8, $9);

        return ($remote, $config, $method, $words, $logic, $matches, $perpage, $curpage, $referrer);
    }

    # Explicit empty return so callers can tell "no match" from a record.
    return;
}

# buildclf($time, $host, $remote, $words, $matches, $referrer)
#
# Formats one search record as an Extended Common Log Format line (without
# a trailing newline):
#
#     remote - - [timestamp] "GET words" status matches "referrer" "agent"
#
#   $time      epoch seconds of the search
#   $host      name of the machine that logged the search; reported in
#              the user-agent field as "ht://dig from $host"
#   $remote    address of the client that ran the search
#   $words     search words; placed where CLF expects the requested URL
#   $matches   number of matches; placed where CLF expects the byte count
#   $referrer  referring page
#
# The status field is 404 when there were no matches and 200 otherwise.
sub buildclf {
    my ($time, $host, $remote, $words, $matches, $referrer) = @_;

    # Percent-encode the characters that would break the quoted CLF fields.
    # '%' is encoded in the search words too, so the result stays decodable.
    $words    =~ s/(["%\\])/sprintf "%%%02x", ord($1)/eg;
    $referrer =~ s/(["\\])/sprintf "%%%02x", ord($1)/eg;

    # The time is broken down with gmtime(), so the zone must be written as
    # +0000. strftime's %z would report the *local* offset and mislabel the
    # instant on any machine not running in UTC.
    my $datetime = strftime("[%d/%b/%Y:%H:%M:%S +0000]", gmtime($time));

    my $status    = ($matches == 0) ? "404" : "200";
    my $rfc931    = "-";
    my $authuser  = "-";
    my $request   = "\"" . "GET $words" . "\"";
    my $quoted_referrer = "\"" . $referrer . "\"";
    my $useragent = "\"" . "ht://dig from $host" . "\"";

    return "$remote $rfc931 $authuser $datetime $request $status $matches $quoted_referrer $useragent";
}

# Main loop: read syslog lines from standard input and print one Extended
# Common Log Format line per htsearch entry. Lines that are not in the
# expected syslog layout, or whose message is not an htsearch record, are
# skipped rather than turned into bogus log entries.
while (my $line = <STDIN>) {
    my ($time, $host, $process, $htlog) = parse_syslog($line);
    # $htlog is always defined after a successful syslog parse.
    next unless defined $htlog;

    my ($remote, $config, $method, $words, $logic, $matches, $perpage, $curpage, $referrer)
        = parse_htlog($htlog);
    # $matches is always defined after a successful htsearch parse.
    next unless defined $matches;

    print buildclf($time, $host, $remote, $words, $matches, $referrer) . "\n";
}

