# ------------------------------------------------------------------------
# HtDig.pm: Perl Module for interface to Ht:/Dig databases & stuff.
# Ht:/Dig is a free Intranet search engine http://www.htdig.org/
# Version: 0.1b
# Author: heddy Boubaker <boubaker@cenatoulouse.dgac.fr>
# Revisions:
#  05 Feb 1999 BirthDate
# ------------------------------------------------------------------------

#use diagnostics -verbose;
#use strict;
require 5.00502;
use Carp;
use BerkeleyDB;

package HtDig;

# Configurable vars
$home = '/www/Tools/Htdig';


# ------------------------------------------------------------------------

package HtDig::Config;
# Interface to the htdig config

# Configurable vars
$config_dir = $HtDig::home . '/conf';

# Some default values
$common_dir = $HtDig::home . '/common';
$create_images_list = 0;
$create_url_list = 0;
$database_dir = $HtDig::home . '/db';
$database_base = $database_dir . '/db';
$doc_db = $database_base . '.docdb';
$doc_index = $database_base . '.docs.index';
$doc_list = $database_base . '.docs';
$image_list = $database_base . '.images';
$server_aliases = '';
$start_url = 'http://www/';
$synonym_dictionary = $common_dir . '/synonyms';
$synonym_db = $common_dir . '/synonyms.db';
$url_list = $database_base . '.urls';
$word_db = $database_base . '.words.db';
$word_list = $database_base . '.wordlist';

# The configuration
%config =
    (
     'common_dir' => $common_dir,
     'config_dir' => $config_dir,
     'create_url_list' => $create_url_list,
     'create_images_list' => $create_images_list,
     'database_dir' => $database_dir,
     'database_base' => $database_base,
     'doc_db' => $doc_db,
     'doc_index' => $doc_index,
     'doc_list' => $doc_list,
     'image_list' => $image_list,
     'server_aliases' => $server_aliases,
     'start_url' => $start_url,
     'synonym_dictionary' => $synonym_dictionary,
     'synonym_db' => $synonym_db,
     'url_list' => $url_list,
     'word_db' => $word_db,
     'word_list' => $word_list,
     );


###
# parse( );
# TODO
###
#sub parse ( $ ) {
#    my $configfile = shift;
#} # end parse();



# ------------------------------------------------------------------------

package HtDig::DocDB;
# Interface to the docdb database


# Configurable vars
$db_file = $HtDig::Config::config{'doc_db'};

# The current record that could be accessed in
# $func prom within process()
%record = {};

###
# process( );
###
sub process ( &$ ) {
    my ( $func, $file ) = @_;
    my ( %db, $key, $value );
    tie( %db, 'BerkeleyDB::Btree', -Filename => $file, -Flags => DB_RDONLY ) ||
        die "Error: $file - $!";
    while (( $key, $value ) = each %db ) {
        next if $key =~ /^nextDocID/;
        &_parse_value( $key, $value );
        #print "Eval w/ $key\n";
        eval &$func;
    }
} # end process();


sub _parse_value ($$) {
    my ( $key, $value ) = @_;
    my ( $length, $count, $result, $what );
    # reset record
    $record = {};
    while ( length( $value ) > 0 ) {

$what  = unpack("C", $value);

$value = substr($value, 1);

if ($what == 0)

{

    # ID

    $record{"ID"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 1)

{

    # TIME

    $record{"TIME"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 2)

{

    # ACCESSED

    $record{"ACCESSED"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 3)

{

    # STATE

    $record{"STATE"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 4)

{

    # SIZE

    $record{"SIZE"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 5)

{

    # LINKS

    $record{"LINKS"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 6)

{

    # IMAGESIZE

    $record{"IMAGESIZE"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 7)

{

    # HOPCOUNT

    $record{"HOPCOUNT"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 8)

{

    # URL

    $length = unpack("i", $value);

    $record{"URL"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 9)

{

    # HEAD

    $length = unpack("i", $value);

    $record{"HEAD"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 10)

{

    # TITLE

    $length = unpack("i", $value);

    $record{"TITLE"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 11)

{

    # DESCRIPTIONS

    $count = unpack("i", $value);

    $value = substr($value, 4);

    $result = "";

    foreach (1 .. $count)

    {


$length = unpack("i", $value);


$result = $result . unpack("x4 A$length", $value) . " ";


$value = substr($value, 4 + $length);

    }

    chop $result;

    $record{"DESCRIPTIONS"} = $result;

}

elsif ($what == 12)

{

    # ANCHORS

    $count = unpack("i", $value);

    $value = substr($value, 4);

    $result = "";

    foreach (1 .. $count)

    {


$length = unpack("i", $value);


$result = $result . unpack("x4 A$length", $value) . " ";


$value = substr($value, 4 + $length);

    }

    chop $result;

    $record{"ANCHORS"} = $result;

}

elsif ($what == 13)

{

    # EMAIL

    $length = unpack("i", $value);

    $record{"EMAIL"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 14)

{

    # NOTIFICATION

    $length = unpack("i", $value);

    $record{"NOTIFICATION"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 15)

{

    # SUBJECT

    $length = unpack("i", $value);

    $record{"SUBJECT"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 16)

{

    # STRING (ignore, but unpack)

    $length = unpack("i", $value);

    $record{"STRING"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 17)

{

    # METADSC

    $length = unpack("i", $value);

    $record{"METADSC"} = unpack("x4 A$length", $value);

    $value = substr($value, 4 + $length);

}

elsif ($what == 18)

{

    # BACKLINKS

    $record{"BACKLINKS"} = unpack("i", $value);

    $value = substr($value, 4);

}

elsif ($what == 19)

{

    # SIGNATURE

    $record{"SIG"} = unpack("i", $value);

    $value = substr($value, 4);

}
    }
}


# ------------------------------------------------------------------------

package HtDig::WordDB;
# Interface to the worddb database

# Configurable vars
$db_file = $HtDig::Config::config{'word_db'};

# The current record that could be accessed in
# $func prom within process()
%record = {};

###
# process( );
###
sub process ( &$ ) {
    my ( $func, $file ) = @_;
    my ( %db, $key, $value );
    tie( %db, 'BerkeleyDB::Btree', -Filename => $file, -Flags => DB_RDONLY ) ||
        die "Error: $file - $!";
    while (( $key, $value ) = each %db ) {
        #next if $key =~ /^nextDocID/;
        &_parse_value( $key, $value );
        #print "Eval w/ $key\n";
        eval &$func;
    }
} # end process();


sub _parse_value ($$) {
    my ( $key, $value ) = @_;
    my ( $length, $count, $id, $weight, $anchor, $location );
    # reset record
    $record = {};
    $record{'WORD'}  = $key;
    # extracted from wordfreq.pl
    $length = length( $value ) / 20;
    $record{'TOTAL'} = 0;
    $record{'NDOCS'} = 0;
    $record{'DOCS'}  = [];
    foreach $i ( 0 .. $length - 1 ) {

($count, $id, $weight, $anchor, $location ) =

    unpack("i i i i i", substr( $value, $i * 20, 20 ));
        $record{'TOTAL'} += $count;
        $record{'NDOCS'}++;
        $record{'DOCS'}[$i] = { 'ID' => $id,
                                'WEIGHT' => $weight,
                                'ANCHOR' => $anchor,
                                'LOCATION' => $location
                                };
    }
}


# In case of required...
1;
# HtDig.pm ends here  ------------------------------------------------


