#!/usr/local/bin/perl
#
#  report.missing.pages.pl  v1.0  Feb 23 1999  Sinclair Budd ( s.budd@ic.ac.uk)
#
#   Produces a report of the missing pages and mails to the webmaster
#   of the server whose page has a link to the missing page.
#   
#
#
#   Usage: report.missing.pages.pl   logfile  reportfile     webmastersfile
#
#        (IN)   logfile        full path to the file to which the STDOUT of an htdig run has been directed.
#                              eg  from a command like     htdig > /tmp/log
#
#        (OUT)  reportfile     full path to a file where the program writes the information
#                              about  missing pages for which no webmaster is known.  
#
#        (IN)   webmastersfile the full path to a file which contains the webmasters'
#                              email addresses, one for each of the servers.
#                              The webmasters appear one per line, the first entry is the
#                              IP name of the web server and the second entry, separated by
#                              a space, is the email address of the server's webmaster.
#                              e.g.    www.test.com fred.bloggs@test.com
#     
#         (OUT)  STDOUT         a list of all missing web pages and the pages which reference them.
#
# -------------------------------------------------------------------------------------
#   
#  Open files and initialize variables.
#.................................
# 
#  Read in the webmasters and store in associative array
#  
    # Refuse to run without the three documented arguments
    # (logfile, reportfile, webmastersfile) instead of failing later on
    # an undefined file name.
    @ARGV >= 3
        || die "Usage: $0 logfile reportfile webmastersfile\n";

    # Load the webmasters file into %masters (server name => email address).
    # Three-argument open is used so that characters such as '>' or '|' in
    # the file name given on the command line cannot change the open mode.
    open( my $masters_fh, '<', $ARGV[2] )
        || die " Can't open webmasters file $ARGV[2]: $! ";
    while ( my $line = <$masters_fh> )
      {
         chomp $line;
         # First whitespace-separated field is the server, second its webmaster.
         my ( $serv, $mast ) = split( " ", $line );
         # A blank line yields no fields; skip it rather than storing an
         # entry under the empty key.
         next unless defined $serv;
         $masters{$serv} = $mast;
      }
    close($masters_fh);

    # Echo the table that was read so the run can be checked by eye.
    print "Webmaster  List \n";
    foreach my $in ( keys %masters )
      {  print $in, " ", $masters{$in}, "\n";  }

#  Open the htdig  STDOUT  file
#
    # Open the captured htdig output read-only.  Three-argument open keeps
    # special characters in the command-line file name from being
    # interpreted as an open mode.
    open( my $log_fh, '<', $ARGV[0] )
        || die " Can't open the htdig log file   $ARGV[0]: $! ";

    # Sentinel for the report loop: server names are whitespace-free
    # tokens, so a single space never matches one.
    $previous_server = " ";
#
# ..................................
#
#   Loop over the lines in the log file
#   and store the information on missing pages in the array tobesorted.
#   Each stored entry is "server referring_page missing_page" so that a
#   plain string sort groups the entries by server.
#
    while ( my $line = <$log_fh> )
     {
          next if $line !~ /^Not found/;
          chomp $line;
          # Fields 3 and 5 of the line are taken as the missing page and
          # the page that refers to it; the other fields are not needed.
          my ( undef, undef, $page, undef, $frompage ) = split( " ", $line );
          # Without a referring page there is nobody to notify.
          next unless defined $frompage;
          # "scheme://host/path" split on "/" puts the host in the third field.
          my ( undef, undef, $server ) = split( "/", $frompage );
          next unless defined $server && length $server;
          # push fills the array from index 0; starting at index 1 would
          # leave an undefined first element that later turns into an
          # empty, bogus report entry.
          push @tobesorted, join( " ", $server, $frompage, $page );
     }
    close($log_fh);

#
# ..................................
#
#   Sort the list of not_found pages

    # Plain string ordering: every entry starts with the server name, so
    # entries for the same server end up next to each other.
    @sorted = sort { $a cmp $b } @tobesorted;

#
# ..................................
#
#   Scan the sorted list and produce the report
#
     # Walk the sorted entries ("server referring_page missing_page").
     # Whenever the server changes, a new output is opened: a mail pipe to
     # that server's webmaster when one is listed in %masters, otherwise the
     # report file (append mode).  Everything is also echoed to STDOUT.
     my $mout;    # output handle for the server currently being reported
     foreach my $in (@sorted)
       {
         # Ignore undefined or blank slots so they cannot produce a report
         # entry with an empty server name.
         next unless defined $in && $in =~ /\S/;

         my ( $server, $frompage, $page ) = split( " ", $in );
         if ( $server ne $previous_server )
            {
                # Finish the previous server's output first; closing a pipe
                # waits for the mailer and reports its failure.
                if ( defined $mout )
                   {
                      close($mout)
                          || warn " Problem closing the report output for $previous_server: $! (status $?) ";
                   }

                my $webmaster = $masters{$server};
                $webmaster = "" unless defined $webmaster;

                if ( $webmaster eq "" )
                     {
                        # No known webmaster: collect in the report file.
                        open( $mout, '>>', $ARGV[1] )
                            || die " Can't open the report file $ARGV[1]: $! ";
                     }
                else
                     {
                        # List-form pipe open runs Mail directly, without a
                        # shell, so the address read from the webmasters
                        # file cannot be interpreted as shell syntax.
                        open( $mout, '|-', 'Mail', '-s', 'missing_web_pages', $webmaster )
                            || die " Can't pipe to mailer: $! ";
                     }

                my @header = (
                    "Server ", $server, "  Webmaster ", $webmaster, " \n",
                    " The first line of the pair is the URL of the page which references the missing page \n",
                    " The second line is the URL of the missing page \n\n",
                );
                print         @header;
                print {$mout} @header;
                $previous_server = $server;
            }

         # One entry per missing page.  The entry is printed only here, so
         # the first page of each server is not listed twice.
         my @entry = ( "     ", $frompage, "     Cant find \n         ", $page, "\n\n" );
         print         @entry;
         print {$mout} @entry;
       }

     # Flush the last report or mail.
     if ( defined $mout )
        {
           close($mout)
               || warn " Problem closing the report output for $previous_server: $! (status $?) ";
        }
   
   


