#!/usr/bin/env perl
#----------------------------------------------------------------------------
#
# Name: split_ingest_dlg.pl
#
# This is an interactive tool that breaks an ingest daily log file (DLG) into
# separate files named after their Ingest group name (association name or 
# dataset name) and group_data_id, using the format: 
# <group_name>_<group_data_id>.log. It also splits log lines after column 80.
#
# Account requirements:
#
# The account using this program must use the opus_login.csh script
# to set up the required ENV variables for this script.
#
# Command Usage: (see $usage definition below)
#
#  Return values:
#     0 - success
#     1 - failure
#
# ENV variables:
#   ARCH_SERVER
#   ARCH_DB
#    
# History:
# Date     OPR      Who         Reason
# -------- -------- ----------  ---------------------------------------------
# 06/03/04 51332    Baum        Initial code
# 03/25/10 64274    MSwam       Replace ST_DBlib with DBI
# 06/30/10 64432    MSwam       Use single quotes for SQLServer
# 09/18/12 72255    Sherbert    get rid of DSQUERY
#----------------------------------------------------------------------------
# set up external routines
unshift @INC,(split /:/, $ENV{PATH});
require 'do_dbi_pkg.pl';        # database query calls

# begin: package globals shared by the main flow and the subroutines below
    # process exit codes (success, failure)
    ($EXIT_SUCCESS, $EXIT_FAILURE) = (0, 1);
    # boolean flag values
    ($false, $true) = (0, 1);
    # marker text that is searched for in the lines of the daily log
    $start_label = '----- RMGR start';
    $group_label = 'GROUP mode';
    $query_label = 'QUERY/LOG mode';

# the database server and database names come from the environment; both
# are needed for the queries, so stop right away when either is missing
    ($ARCH_SERVER, $ARCH_DB) = @ENV{'ARCH_SERVER', 'ARCH_DB'};
    unless (defined($ARCH_SERVER) && defined($ARCH_DB)) {
       print "Missing ENV variables: ARCH_SERVER or ARCH_DB.\n";
       exit($EXIT_FAILURE);
    }
# define command usage
# $usage holds the complete help text. The argument checks further down print
# it, followed by a specific "Error - ..." line, before exiting with
# $EXIT_FAILURE. The here-document is an interpolating one ("EOM"), but the
# text contains no variables, so it is emitted exactly as written.
    $usage = <<"EOM";    
Usage:
>split_ingest_dlg.pl [-a] [-i in_dir] [-o out_dir] [-n log_name] group_name

  Where: 
             -a = (optional): add install logs to group logs;
             -i = (optional): to specify daily log file directory, else 
                  ingest_data_set_info record must still exist for log file or
                  log file must reside in default directory;
             -o = (optional): to specify directory for output files, else
                  the default directory is used;
             -n = (required if group_name has a wildcard or query using
                  group_name finds multiple ids_log_file_name values in
                  ingest_data_set_info table or finds multiple
                  ads_completion_time values in archive_data_set_all table):
                  to specify daily log name - ingest_yyyy_mm_dd.log;
     group_name = (required): "%" or "<full_group_name>" or "<partial_name>%",
                  where % is used as a wildcard character.
  
  Examples:
  
      To divide entire contents of daily log for day 2004_03_12 into group
      files to be put in the default directory, where the log file can be found
      in the default directory, or where the log file remains in the Ingest 
      cache, use:
      
      >split_ingest_dlg.pl -a -n ingest_2004_03_12.log %
      
      To divide a retrieved ingest daily log residing in the user specified 
      directory into group files, without installation logs, to be put in the
      default directory, excluding the install log files, use:
       
      >split_ingest_dlg.pl -i ../data/ -n ingest_2004_03_12.log %
      
      To extract specified group logs that include the install logs,
      where the log file remains in Ingest cache and it has an 
      ingest_data_set_info record, use:

      >split_ingest_dlg.pl -a ok5401n010

      Note that if this group name, ok5401n010, has multiple data classes in 
      the same log file multiple logs will be generated having different group
      data_id values. If multiple logs files are found for different classes 
      the logs and class names will be listed so the user may reenter the 
      command with a -n option to specify the log file with the desired class
      of data.

      To extract a set of group logs containing install logs for all ACS
      groups in a given daily log, use:
      
      >split_ingest_dlg.pl -a -n ingest_2004_03_12.log j%
     
EOM
    # start argument checks
    #
    # Parse @ARGV into the option globals read by the rest of the script:
    #   -a          -> $all_log_types
    #   -i in_dir   -> $indir, $indir_specified
    #   -o out_dir  -> $outdir
    #   -n log_name -> $log_file, $log_file_specified
    #   group_name  -> $group_name, $group_name_specified,
    #                  $group_name_wildcard, $group_name_all
    # Any invalid argument prints the usage text followed by an "Error - ..."
    # line and exits with $EXIT_FAILURE.
    $num_args = scalar @ARGV;

    if ($num_args < 1) {
        print $usage;
        exit ($EXIT_FAILURE);
    }

    # report a command line error: usage text, the message, then exit
    my $usage_error = sub {
        my ($msg) = @_;
        print $usage;
        print "Error - $msg\n";
        exit ($EXIT_FAILURE);
    };

    # set default values
    $all_log_types = $false;
    $indir = ".";
    $indir_specified = $false;
    $outdir = ".";
    $log_file_specified = $false;
    $group_name_specified = $false;
    $group_name_wildcard = $false;
    $group_name_all = $false;

    # check all arguments
    while (scalar @ARGV) {
      $arg = shift @ARGV;
      if ($arg eq "-a") {
        $all_log_types = $true;
      } elsif ($arg eq "-i") {
        $indir = shift @ARGV;
        # an option given as the last argument has no value to consume
        $usage_error->("option -i requires a directory") if !defined($indir);
        # remove any trailing slash, but leave a lone "/" (the root
        # directory) intact - stripping it would leave an empty name that
        # can never pass the directory test below
        $indir =~s#(?<=.)/$##;
        # verify that directory exists
        $usage_error->("$indir is not a directory") if !(-d $indir);
        $indir_specified = $true;
      } elsif ($arg eq "-o") {
        $outdir = shift @ARGV;
        $usage_error->("option -o requires a directory") if !defined($outdir);
        # remove any trailing slash (same root directory rule as for -i)
        $outdir =~s#(?<=.)/$##;
        # verify that directory exists
        $usage_error->("$outdir is not a directory") if !(-d $outdir);
      } elsif ($arg eq "-n") {
        $log_file = shift @ARGV;
        $usage_error->("option -n requires a log name") if !defined($log_file);
        $log_file_specified = $true;
        # verify log file format; the pattern is anchored at both ends so
        # that trailing characters after ".log" are rejected as well
        if ($log_file !~m/^ingest_\d{4}_\d{2}_\d{2}\.log\z/) {
          $usage_error->(
             "$log_file is not of the form: ingest_yyyy_mm_dd.log");
        }
      } elsif ($arg =~m/^-/) {
        # anything else starting with a dash is an unknown option, not a
        # group name
        $usage_error->("Invalid option $arg");
      } elsif ($group_name_specified) {
        # a second group name would silently replace the first while the
        # wildcard flags of the first stayed set
        $usage_error->("only one group_name may be given");
      } else {
        $group_name = $arg;
        $group_name_specified = $true;

        # check that any wildcard present is at end of string
        $indx = index( $group_name, "%");
        if ($indx > -1) {
          $group_name_wildcard = $true;
          if ($indx < (length( $group_name) -1)) {
            $usage_error->("Invalid wildcard (%) location in $group_name");
          }
          if ($indx == 0) {
            # a lone "%" selects every group
            $group_name_all = $true;
          } else {
            # strip off trailing wildcard, leaving the prefix to match
            $group_name = substr($group_name,0,$indx);
          }
        }
      }
    }
    if (!$group_name_specified) {
      $usage_error->("group_name is required argument");
    }
    # a wildcard cannot be used to look up the log file in the database
    if ($group_name_wildcard && !$log_file_specified) {
      $usage_error->("group_name wildcard not allowed unless log file_name ".
        "is specified");
    }
    # open database for queries
    # NOTE(review): the third argument is presumably the exit status that
    # DoDBIopen uses when the connection fails - confirm in do_dbi_pkg.pl
    $db = DoDBIopen( $ARCH_SERVER, $ARCH_DB, $EXIT_FAILURE);

    # append trailing slash to directory names
    $indir .= "/";
    $outdir .= "/";

    if (!$log_file_specified) {
       # use group name to find log file
       $log_file = query_log_file_by_group();
    }
    $log_fspec = $indir.$log_file;
    if (!$indir_specified && !(-e $log_fspec)) {
       # log file is not in the default dir, so find the log file directory
       # in ingest_data_set_info
       $indir = query_indir_by_log_file();
       # the stored path is joined directly with the file name, so make sure
       # it ends with a separator (no change when it already does)
       $indir .= "/" if $indir !~ m#/$#;
       $log_fspec = $indir.$log_file;
    }
    # at this point $indir and $log_file are known or the script has exited
    if (!(-e $log_fspec)) {
       ErrorExit("cannot find $log_fspec.");
    }
    # open input file; the three-argument form keeps characters in the path
    # from being interpreted as an open mode. The bareword handle LOGFILE is
    # kept because get_next_log and write_log read from it.
    if (!open (LOGFILE, '<', $log_fspec)) {
       ErrorExit("cannot open $log_fspec for input.");
    }
    print "Processing $log_fspec to output directory $outdir\n";

    # summarize the selection criteria for the user
    if ($all_log_types) {
       print "Installation logs will be merged with group logs.\n";
    } else {
       print "Only group logs will be output.\n";
    }
    if ($group_name_wildcard) {
      if ($group_name_all) {
         print "All group names are included.\n";
      } else {
         print "Only group names matching prefix $group_name are included.\n";
      }
    } else {
      print "Group name must completely match $group_name.\n";
    }
    # split the log; the return value is the number of new files created
    $out_count = process_log_file();
    print "Created $out_count files in $outdir.\n";
    close LOGFILE;
    DoDBIclose($db);
    undef $db;
    exit( $EXIT_SUCCESS);
#----------------------------------------------------------------------------
sub ErrorExit {
   # one argument - the error message without newline
   # Closes the global $db database object when it is defined, prints the
   # message prefixed with "Error - " to standard output, and exits the
   # script with $EXIT_FAILURE. Does not return.

   my $text = shift;

   DoDBIclose($db) if defined($db);

   print "Error - $text\n";
   exit($EXIT_FAILURE);
}
#-----------------------------------------------------------------------------
sub query_indir_by_log_file {  # no args
   # Purpose: look up the directory that holds the daily log named by the
   # global $log_file, taking ids_path_name from the most recent (MAX
   # ids_generation_date) HST/DLG record in ingest_data_set_info.
   # Returns: the path name found.
   # Exits through ErrorExit when no record is returned.

   # The dataset name is the upper-cased log name up to its ".log" suffix.
   # index() returns -1 when ".log" is absent, and a length of -1 would make
   # substr() silently drop the last character, so use the whole name then.
   my $ext_idx = index( $log_file, ".log");
   my $base_name = ($ext_idx < 0) ? $log_file
                                  : substr($log_file, 0, $ext_idx);
   my $log_dataset = uc($base_name);

   my $query =<<"EOQ";
SELECT ids_path_name FROM ingest_data_set_info
WHERE ids_data_set_name = '$log_dataset' and ids_mission = 'HST' and
ids_archive_class = 'DLG' and ids_generation_date =
   (SELECT MAX(ids_generation_date) FROM ingest_data_set_info
    WHERE ids_data_set_name = '$log_dataset' and ids_mission = 'HST' and
    ids_archive_class = 'DLG')
EOQ

   print "Query log directory from ids_path_name.\n";
   # NOTE(review): assumes DoDBIselect returns the columns of the first
   # matching row as a list - confirm in do_dbi_pkg.pl
   my @record = DoDBIselect ($db, $query);
   if (!defined( $record[0])) {
     ErrorExit("cannot find ingest_data_set_info record for $log_dataset.");
   }
   my $path_name = $record[0];
   print "Found log directory: $path_name.\n";
   return $path_name;
}
#-----------------------------------------------------------------------------
sub query_log_file_by_group {  # no args
   # Purpose: query ids_log_file_name from ingest_data_set_info for the
   # global $group_name (most recent record of each archive class),
   # reporting every entry found together with its class name.
   # Returns: the log file name when all entries name the same log file.
   # When no entry is found the name is derived from archive_data_set_all
   # instead (query_log_file_from_ads). When the entries name different log
   # files the script exits, asking the user for the -n option.

   my $dataset = uc($group_name);
   # the group name comes from the command line, so double any single quote
   # before placing it inside the quoted SQL literal; the unescaped name is
   # kept for the fallback query and for messages
   (my $sql_dataset = $dataset) =~ s/'/''/g;

   my $query =<<"EOQ";
SELECT DISTINCT ids1.ids_log_file_name, ids1.ids_archive_class 
FROM ingest_data_set_info ids1
WHERE ids_data_set_name = '$sql_dataset' and ids_generation_date =
   (SELECT MAX(ids_generation_date) FROM ingest_data_set_info ids2
    WHERE ids2.ids_data_set_name = '$sql_dataset' and
    ids2.ids_archive_class = ids1.ids_archive_class)
EOQ

   print "Query log name from ids_log_file_name:\n";

   # One row comes back per (log name, class) pair. A group with several
   # data classes in the same daily log is a valid case (it simply produces
   # several output files), so count distinct log names rather than rows.
   my %seen_log_name;
   my $log_file_name;
   my $sth = DoDBIexecute( $db, $query);
   while ( my @record = DoDBIfetch( $db, $query, $sth) ) {
      print "Found log name: $record[0] for class $record[1]\n";
      $log_file_name = $record[0];
      $seen_log_name{ defined($record[0]) ? $record[0] : "" } = 1;
   }
   my $count = scalar keys %seen_log_name;

   if ($count == 0) {
      print "No ids_log_file_name values returned.\n";
      $log_file_name = query_log_file_from_ads($dataset);
   } elsif ($count > 1) {
     # more than one distinct log file - the user has to choose
     print "Error - cannot find unique log name in ingest_data_set_info - \n";
     ErrorExit(" use option <-n log_name> to specify log file.");
   }
   return $log_file_name;
}
#-----------------------------------------------------------------------------
sub query_log_file_from_ads {  # one arg - dataset
   # Purpose: query ads_completion_time from archive_data_set_all (most
   # recent record of each archive class) and convert each date to an ingest
   # log file name of the form ingest_yyyy_mm_dd.log, reporting every name
   # together with its class.
   # Returns: the log file name when all entries derive the same name.
   # Exits through ErrorExit when the dataset is not found or when the
   # entries derive different log file names.

   my ($dataset) = @_;
   # double any single quote before placing the name inside the quoted SQL
   # literal; the unescaped name is kept for messages
   (my $sql_dataset = $dataset) =~ s/'/''/g;

   my $query =<<"EOQ";
SELECT DISTINCT CONVERT( varchar,ads_completion_time, 102), ads_archive_class 
FROM archive_data_set_all ads1
WHERE ads_data_set_name = '$sql_dataset' and ads_generation_date =
   (SELECT MAX(ads_generation_date) FROM archive_data_set_all ads2
    WHERE ads2.ads_data_set_name = '$sql_dataset' and
    ads2.ads_archive_class = ads1.ads_archive_class)
EOQ

   print "Deriving log name from ads_completion_time:\n";

   # One row comes back per (date, class) pair. Several classes completed on
   # the same day share one daily log, so count distinct derived log names
   # rather than rows.
   my %seen_log_name;
   my $log_file;
   my $sth = DoDBIexecute( $db, $query);
   while ( my @record = DoDBIfetch( $db, $query, $sth) ) {
      $log_file = $record[0];
      # convert date (yyyy.mm.dd) to log file name
      $log_file =~s/^(\d{4})\.(\d{2})\.(\d{2})/ingest_$1_$2_$3.log/;
      print "Derived log name: $log_file for class $record[1]\n";
      $seen_log_name{ defined($log_file) ? $log_file : "" } = 1;
   }
   my $count = scalar keys %seen_log_name;

   if ($count == 0) {
     ErrorExit("cannot find $dataset in archive_data_set_all.");
   } elsif ($count > 1) {
     # more than one distinct log date - the user has to choose
     print "Error - cannot find unique log date in archive_data_set_all - \n";
     ErrorExit(" use option <-n log_name> to specify log file.");
   }
   return $log_file;
}
#-----------------------------------------------------------------------------
sub process_log_file {  # no args
   # Purpose: drive the split of the open log file. Each RMGR log found is
   # fetched, tested, and written out when wanted.
   # Returns: the number of output files newly created.

   # These are declared with local (not my) on purpose: the subroutines
   # called from here read and update them.
   local $grp_data_id;
   local $grp_name;
   local $end_of_file = $false;
   local $out_file_cnt = 0;
   local @in_buffer = ();    # pending input lines of the current RMGR log

   get_next_log();
   until ($end_of_file) {
      write_log() if log_file_wanted();
      get_next_log();
   }
   return $out_file_cnt;
}
#-----------------------------------------------------------------------------
sub get_next_log {  # no args
   # Purpose: load the first four lines of the next RMGR log into @in_buffer.
   # When the buffer already holds a line (the start line saved by an earlier
   # read) the search for a start line is skipped. Sets $end_of_file when
   # the input runs out.

   # skip input lines until one contains the RMGR start label
   until ($end_of_file || @in_buffer) {
      my $candidate = <LOGFILE>;
      if (!defined($candidate)) {
         $end_of_file = $true;
      } elsif (index( $candidate, $start_label) > 0) {
         push @in_buffer, $candidate;
      }
   }
   return if $end_of_file;

   # only the start record should be buffered now; add the three records
   # that follow it
   foreach my $nth (1 .. 3) {
      my $follower = <LOGFILE>;
      unless (defined($follower)) {
         $end_of_file = $true;
         last;
      }
      push @in_buffer, $follower;
   }
   return;
}
#-----------------------------------------------------------------------------
sub log_file_wanted {  # no args
   # Purpose: return $true if the four buffered RMGR records are valid and
   # the log is wanted, else $false. When the records can be parsed,
   # $grp_name and $grp_data_id are updated from them:
   #   - group log (2nd line has the GROUP label): both values come from
   #     the start line, "...: <group_name>_<3 char data_id>"
   #   - installation log (2nd line has the QUERY/LOG label): only
   #     considered when $all_log_types is set; the values follow ": " on
   #     the 3rd and 4th lines
   # When the log is not wanted @in_buffer is emptied so that the next
   # search starts clean.
   my $wanted = $false;
   my $line_count = scalar @in_buffer;

   if ($line_count != 4) {
      print "Warning: invalid buffer line count = ".$line_count."\n";
   } elsif (index( $in_buffer[0], $start_label) <= 0) {
      print "Warning: invalid start line- $in_buffer[0]\n";
   } elsif (index( $in_buffer[1], $group_label) > 0) {
      # parse start line for group ids
      my $delim_idx = rindex($in_buffer[0],"_");
      my $space_idx = rindex($in_buffer[0],": ");
      if ($delim_idx < 0 || $space_idx < 0) {
         print "Warning: invalid start line- $in_buffer[0]\n";
      } else {
         $grp_data_id = substr($in_buffer[0], $delim_idx + 1, 3);
         $space_idx += 2;      # step over the ": " prefix
         $grp_name = substr($in_buffer[0], $space_idx,
            ($delim_idx - $space_idx));
         $wanted = group_wanted();
      }
   } elsif (index( $in_buffer[1], $query_label) > 0) {
      # this is an installation log
      if ($all_log_types) {
         # This type of log may be included in output so
         # parse 3rd and 4th line for group name and id.
         # chomp (not chop) is used so that only a line terminator is
         # removed; a last line without a newline keeps its final character.
         my $grp_line = $in_buffer[2];
         chomp $grp_line;
         my $grp_prefix_idx = index($grp_line, ": ");
         my $did_line = $in_buffer[3];
         chomp $did_line;
         my $did_prefix_idx = index($did_line, ": ");

         if ($grp_prefix_idx < 0) {
            print "Warning: invalid group line- $grp_line\n";
         } elsif ($did_prefix_idx < 0) {
            print "Warning: invalid data_id line- $did_line\n";
         } else {
            $grp_name = substr($grp_line,($grp_prefix_idx + 2));
            $grp_data_id = substr($did_line,($did_prefix_idx + 2));
            $wanted = group_wanted();
         }
      }
   } else {
      print "Warning: invalid label line- $in_buffer[1]\n";
   }

   # clean up in_buffer for lines not wanted for output
   @in_buffer = () if !$wanted;

   return $wanted;
}
#-----------------------------------------------------------------------------
sub group_wanted {  # no args
   # Purpose: decide from the global selection settings whether the current
   # $grp_name is wanted. Returns $true or $false.

   # every group is selected
   return $true if $group_name_all;

   # wildcard: $group_name holds the prefix that $grp_name must start with
   if ($group_name_wildcard) {
      return (index( $grp_name, $group_name) == 0) ? $true : $false;
   }

   # otherwise the whole name has to match
   return ($grp_name eq $group_name) ? $true : $false;
}
#-----------------------------------------------------------------------------
sub write_log {  # no args
   # Purpose: write the current RMGR log to <outdir><grp_name>_<grp_data_id>.log.
   # If that file already exists it is appended to, else a new file is
   # created and $out_file_cnt is incremented. The input buffer is emptied
   # into the output file, then input lines are copied until EOF or until
   # the start line of the next log is found; that start line is saved in
   # @in_buffer for the next log.
   # Exits through ErrorExit when the output file cannot be opened.

   my $out_file = $outdir.$grp_name."_".$grp_data_id.".log";

   # The three-argument open is used because the name is built from log
   # content: the two-argument form strips surrounding whitespace and
   # interprets mode characters, so the file opened could differ from the
   # one tested with -e. The bareword handle OUTFILE is kept because
   # write_line prints to it.
   if (-e $out_file) {
      # open existing file for appending
      if (!open (OUTFILE, '>>', $out_file)) {
         ErrorExit("cannot open to append to the old file: $out_file");
      }
   } else {
      # create a new file and count it
      if (!open (OUTFILE, '>', $out_file)) {
         ErrorExit("cannot open new file: $out_file");
      }
      $out_file_cnt += 1;
   }

   # empty buffer into out file, leaving empty buffer
   write_line(shift @in_buffer) while (scalar @in_buffer);

   # copy input lines to out file until eof or new start line is found
   while (!$end_of_file) {
      my $output_line = <LOGFILE>;
      if (!defined( $output_line)) {
         $end_of_file = $true;
      } elsif (index($output_line, $start_label) > 0) {
         # a new log is found
         push @in_buffer, $output_line;  # save for next log
         last;
      } else {
         write_line( $output_line);
      }
   }

   # buffered write errors only show up at close, so report them
   if (!close (OUTFILE)) {
      print "Warning: error closing $out_file: $!\n";
   }
   return;
}
#-----------------------------------------------------------------------------
sub write_line {  # one arg - output line
   # Purpose: print a line to OUTFILE, splitting it when it is longer than
   # 80 characters plus newline. Continued lines are indented by two spaces.
   # The split is made at column 80 when a non-word character is on either
   # side of that position; otherwise the word crossing column 80 is moved
   # to the continuation line. When moving the word cannot shorten the line
   # (two characters or fewer precede it) the word is split at column 80
   # instead, because otherwise the same line would be produced again and
   # the loop would never end.
   my ($line) = @_;

   while ((length $line) > 0) {
      if ((length $line) <= 81) {
         # fits (80 characters plus the newline): print as is
         print OUTFILE $line;
         last;
      }
      # default: split line at column 80
      my $head = substr($line,0,80);
      my $tail = substr($line,80);

      # when the break is not at a special character, try to avoid
      # splitting a word by breaking after the last non-word character
      if (($head !~ /\W$/) && ($tail !~ /^\W/)) {
         my ($front,$back) = ($head =~ /(.*\W)(\w+)$/);
         # only use the better break point when it makes progress: the
         # continuation is "  ".$back.$tail, which is shorter than the
         # current line only when $front has more than two characters
         if (defined($back) && (length $front) > 2) {
            $head = $front;
            $tail = $back.$tail;
         }
      }
      print OUTFILE $head."\n";
      $line = "  ".$tail;
   }
   return;
}