
| Current Path : /var/www/web-klick.de/dsh/AMTC-RMS-Batch/1.2/bin/ |
Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64 |
| Current File : /var/www/web-klick.de/dsh/AMTC-RMS-Batch/1.2/bin/bmon.pl |
#!/sw/common-os/oss/perl/5.20.3-4.0.0/bin/perl
# $URL: https://svn.photomask.com/DR_FE/RMS-Batch/branches/AMTC-RMS-Batch-1.2/bin/bmon.pl $
# $Id: bmon.pl 36 2016-05-31 08:38:18Z heydero $
package bmon;
use strict;
use warnings;
our $VERSION; # is set in bin/inc.pl
# Run the shared include file at compile time (per the note on $VERSION it is
# bin/inc.pl that sets the version). With UNITTEST set in the environment the
# file is taken from ./bin, otherwise from the directory of the real script.
BEGIN
{
    use FindBin;
    my $file = ( $ENV{UNITTEST} ? './bin' : $FindBin::RealBin ) . '/inc.pl';
    # The include file must evaluate to a true value, otherwise we give up.
    do $file || die "Couldn't run $file : $@ : $!";
}
use Cwd;
use Carp;
use Pod::Usage; # exports pod2usage
use Data::Dumper; # exports Dumper()
use AppConfig;
use Log::Log4perl;
use AMTC::App::RmsBatch;
use AMTC::LSF;
use AMTC::RMS::Batch;
use AMTC::TPSM::TaskResult;
use AMTC::Logable 'INFO'; # initializes for basic screen logging
use Env 'TECHP_DATA';
# AppConfig object holding the program configuration; created by read_config().
our $Config; # make it accessible for unit tests
# Command line option values and the joblist file name; they are filled in by
# main() / parse_cmdline().
my ( $Opt_help, $Opt_man, $Opt_seconds, $Opt_tpsm, $Joblist_File );
# Logger used by all routines; reinit_logging() fetches it again after a
# possible re-initialization of log4perl.
my $Log = Log::Log4perl->get_logger;
# Separator line that is logged in front of each status report block.
my $RULER = '------------------------------------------------------';
# Modulino guard: the program logic only runs when this file is executed
# directly. When the file is loaded by other code (caller is set), nothing
# below is executed, which makes the script loadable as a module.
unless ( caller )
{
    read_config();
    my $rc = eval { __PACKAGE__->main() };
    # Any exception raised by main() is logged and terminates the program.
    $Log->logdie( $@ ) if $@;
    exit $rc;
}
#-----------------------------------------------------------------------------
# Entry point of the script's logic. See POD.
#
# Parses the command line, reports the jobs without a job ID, monitors all
# other jobs until they have terminated, logs the final report and returns
# the exit code computed by get_exit_code().
sub main
{
    ( $Joblist_File ) = parse_cmdline();

    # -h prints the usage together with the version, -man the man page.
    pod2usage( -exitstatus => 0, -msg => "Version: $VERSION\n" )
        if $Opt_help;
    pod2usage( -exitstatus => 0, -verbose => 2 ) if $Opt_man;

    defined $Joblist_File or die "missing joblist";
    reinit_logging();

    my @all_jobs = AMTC::RMS::Batch::read_joblist_file( $Joblist_File );
    log_lines( create_not_started_jobs_report( @all_jobs ) );

    my %monitored_jobs = get_jobs_with_job_id( @all_jobs );
    $Log->debug( "joblist = ", Dumper( \%monitored_jobs ) );

    my %final_status_report =
        monitor_jobs_until_all_terminated( %monitored_jobs );
    log_lines( $RULER, format_status_report( \%final_status_report ) );

    my $exit_code = get_exit_code( \%final_status_report );
    print_message_for_tpsm( $exit_code );
    return $exit_code;
}
# Hand the session result over to the TPSM as a single event.
#
# The event is "ERROR" for a non-zero $exit_code. Otherwise it is
# "BMON_<identifier>_OK", or "BMON_OK" if the -tpsm identifier is empty.
#
# NOTE(review): parse_cmdline() stores '' instead of undef when -tpsm is not
# given, so after command line parsing the 'defined' guard below does not
# skip anything -- confirm that this is intended.
sub print_message_for_tpsm
{
    my ( $exit_code ) = @_;
    return unless defined $Opt_tpsm;

    my $tpsm  = AMTC::TPSM::TaskResult->new;
    my $event = "ERROR";
    if ( $exit_code == 0 )
    {
        my $identifier_part = $Opt_tpsm ? "_" . $Opt_tpsm : '';
        $event = "BMON" . $identifier_part . "_OK";
    }
    $tpsm->add_event( $event );
    $tpsm->write();
}
# Poll LSF until every given job has terminated.
#
# Takes a hash (job ID => job record) as a flat list and returns a hash
# (job ID => status record) holding the record of each job as it was when
# the job was first seen as terminated.
#
# After each polling cycle that leaves jobs to monitor, the status of all
# jobs of that cycle is logged and the routine sleeps for $Opt_seconds.
sub monitor_jobs_until_all_terminated
{
    my %pending = @_;
    my %finished;
    while ( keys %pending )
    {
        my %lsf_jobs = query_lsf( \%pending );
        $Log->debug( "lsf_jobs = ", Dumper( \%lsf_jobs ) );
        my $snapshot = create_status_report_struct( \%pending, \%lsf_jobs );
        $Log->debug( "status_report_struct = ", Dumper( $snapshot ) );

        # Move the jobs that have terminated in this cycle to the result.
        my @terminated_ids =
            grep { $snapshot->{$_}->{terminated} } keys %{$snapshot};
        delete @pending{@terminated_ids};
        @finished{@terminated_ids} = @{$snapshot}{@terminated_ids};

        # No intermediate report (and no sleep) once everything is done.
        last unless keys %pending;
        log_lines( $RULER, format_status_report( $snapshot ) );
        sleep $Opt_seconds;
    }
    return %finished;
}
# Log each given line at INFO level. Trailing newlines are removed from a
# private copy of the lines, so the caller's values stay untouched.
sub log_lines
{
    chomp( my @messages = @_ );
    foreach my $message ( @messages )
    {
        $Log->info( $message );
    }
}
# Derive the program's exit code from a final status report (hashref:
# job ID => status record).
#
# Returns 1 as soon as a job is found whose status string starts with
# "failed" or "unknown", otherwise 0 (also for an empty report).
# Croaks if a visited record is not flagged as terminated.
sub get_exit_code
{
    my ( $status_report ) = @_;
    for my $id ( keys %{$status_report} )
    {
        my $entry = $status_report->{$id};
        if ( ! $entry->{terminated} )
        {
            croak "Internal error: Non terminated job [$id]";
        }
        return 1 if $entry->{job_status_string} =~ /^(failed|unknown)/;
    }
    return 0;
}
=begin comment

Layout of one record of the status report that is passed in:

  $job_id => {
      job_status_string => 'running; 42.00% at 14:45 00.48e 01.06r 345kf 0b 100%c',
      output_path       => '/path/to/CHIP1_1_FRAC.log',
      job_name          => 'c2v_FRAC.CHIP_1',
      reference         => 'CHIP1_1',
      job_id            => '123456',
      master_host       => 'amux176',
      master_pid        => 30586,
      terminated        => 0,
  }

Format of one returned line:

  "Job <LSF job name> [<reference>,<LSF job ID>,<master host>,<master PID>] <status>"

=end comment

=cut

# Turn a status report (hashref: job ID => record) into a list of report
# lines, one newline terminated line per job.
#
# The job name column and the details column are padded to the width of
# their longest entry. Lines are ordered by job ID (string sort), so that
# successive reports list the jobs in the same order.
sub format_status_report
{
    my ( $status_report ) = @_;
    $Log->debug( "status_report=", Dumper( $status_report ) );

    # Reduce each record to the three columns of the report.
    my %new_status_report;
    foreach my $job_id ( keys %$status_report )
    {
        my $old_record = $status_report->{$job_id};
        my $job_name   = $old_record->{job_name};
        my $status     = $old_record->{job_status_string};
        my @details    = map { $old_record->{$_} || '' }
            qw( reference job_id master_host master_pid );
        $new_status_report{$job_id} = {
            # undef would be printed as an empty string anyway; defaulting
            # here merely avoids "uninitialized" warnings
            job_name => defined $job_name ? $job_name : '',
            status   => defined $status ? $status : '',
            details  => join( ',', @details ),
        };
    }
    $Log->debug( "new_status_report=", Dumper( \%new_status_report ) );

    # The column widths are the same for every line, so they (and the
    # resulting format string) are computed once rather than per job.
    my $max_lengths =
        get_max_length( \%new_status_report, qw( job_name details ) );
    my $format = 'Job %-' . $max_lengths->{job_name} . 's'
        . ' [%-' . $max_lengths->{details} . 's]'
        . " %s\n";

    my @lines;
    foreach my $job_id ( sort keys %new_status_report )
    {
        my $record = $new_status_report{$job_id};
        push @lines, sprintf $format,
            $record->{job_name},
            $record->{details},
            $record->{status};
    }
    return @lines;
}
# Determine the maximum string length of the given fields over all records
# of a report.
#
# Parameters:
#   $status_report - hashref: job ID => record (hashref)
#   @field_names   - names of the record fields to be measured
#
# Returns a hashref: field name => maximum length. The length is 0 for an
# empty report and for fields that are missing or undef in every record.
sub get_max_length
{
    my ( $status_report, @field_names ) = @_;
    my %max_lengths;
    foreach my $field_name ( @field_names )
    {
        my $max_length = 0;
        foreach my $job_id ( keys %$status_report )
        {
            my $record = $status_report->{$job_id};
            my $value  = $record->{$field_name};
            # An undef value has no length; skipping it avoids comparing
            # undef numerically (which raises a warning).
            next unless defined $value;
            my $length = length $value;
            $max_length = $length if $length > $max_length;
        }
        $max_lengths{$field_name} = $max_length;
    }
    return \%max_lengths;
}
# Parse the command line and return the joblist file. (It might be undef, if
# not set by the user.)
#
# Side effects: sets $Opt_help, $Opt_man, $Opt_seconds (60 unless a non-zero
# number of seconds is given) and $Opt_tpsm ('' unless given).
#
# NOTE(review): the return value of args() is not checked, so a failed parse
# is not treated as fatal here -- confirm whether it should be.
sub parse_cmdline
{
    my $config = AppConfig->new({ PEDANTIC => 1 });
    $config->define( "h|help" );
    $config->define( "man" );
    $config->define( "seconds=s", { VALIDATE => '^\d+$' } );
    $config->define( "tpsm=s");
    $config->define( "joblist=s" );
    $config->args(); # consumes @ARGV

    $Opt_help    = $config->help;
    $Opt_man     = $config->man;
    $Opt_seconds = $config->seconds || 60; # default
    $Opt_tpsm    = $config->tpsm || '';    # default
    return $config->joblist;
}
# Create the global $Config object, declare the two hash valued directives
# of the program and load the INI file 'bmon.ini' into it.
sub read_config
{
    $Config = AppConfig->new( { PEDANTIC => 1 } );
    foreach my $directive ( "current_status_regex=s%", "end_marker_regex=s%" )
    {
        $Config->define( $directive );
    }
    AMTC::App::RmsBatch::read_conf_file( $Config, 'bmon.ini' );
}
# Build the report lines for all jobs that are NOT monitored, i.e. the jobs
# whose 'job_ID' is undef or empty.
#
# A line shows the position of the job in the job list (counting from 1 and
# including the started jobs) and the job reference, padded to the longest
# reference among the reported jobs.
#
# Example:
# Job   2 [FRAME ] not started
# Job  34 [PRIME1] not started
#
sub create_not_started_jobs_report
{
    my @jobs = @_;

    # Collect position and reference of each job without a job ID and keep
    # track of the widest reference along the way.
    my @not_started;
    my $width    = 0;
    my $position = 0;
    for my $job ( @jobs )
    {
        $position++;
        my $job_id = $job->{rms}->{job_ID};
        next if defined $job_id && $job_id ne '';

        my $reference = $job->{reference} || '';
        $width = length( $reference ) if length( $reference ) > $width;
        push @not_started, [ $position, $reference ];
    }

    my $format = "Job %3s [%-${width}s] not started\n";
    my @lines;
    for my $entry ( @not_started )
    {
        push @lines, sprintf $format, @{$entry};
    }
    return @lines;
}
# Filter the given list of jobrefs as returned by routine 'read_joblist_file',
# and return a hash whose keys are 'job_ID's mapping to the corresponding
# jobref.
#
# Jobs whose 'job_ID' is undef or an empty string are left out. These are
# exactly the jobs that 'create_not_started_jobs_report' lists as not
# started, so no job is both reported as not started and monitored.
sub get_jobs_with_job_id
{
    my @jobs = @_;
    my %rv;
    foreach my $job ( @jobs )
    {
        my $job_id = $job->{rms}->{job_ID};
        next unless defined $job_id && $job_id ne '';
        $rv{$job_id} = $job;
    }
    return %rv;
}
# Query LSF for the status of the given jobs and return a hash whose keys
# are LSF job_IDs which refer to AMTC::LSF::Job objects.
#
# The input parameter is a hashref as returned by the routine
# 'get_jobs_with_job_id'; only its keys (the job IDs) are used.
#
# Any error is fatal.
#
# If no status for a given job could be retrieved from LSF, then that job is
# NOT referenced in the returned hash! (FYI: This may happen, if the job has
# finished, but is old enough in order to have "fallen out" of the active
# reporting. Such jobs could still be found by means of "bjobs", but this is
# not implemented with AMTC::LSF yet.)
#
sub query_lsf
{
    my ( $jobsref ) = @_;
    my $lsf      = AMTC::LSF->new;
    my @job_ids  = keys %{ $jobsref };
    my @lsf_jobs = $lsf->query_jobs( { job_id => \@job_ids } );

    # Index the returned job objects by their LSF job ID.
    my %rv;
    foreach my $lsf_job ( @lsf_jobs )
    {
        $rv{ $lsf_job->job_id } = $lsf_job;
    }
    return %rv;
}
=begin comment
$job_id => {
job_status_string => 'running; 42.00% at 14:45 00.48e 01.06r 345kf 0b 100%c',
output_path => '/path/to/CHIP1_1_FRAC.log',
job_name => 'c2v_FRAC.CHIP_1',
reference => 'CHIP1_1',
job_id => '123456',
master_host => 'amux176',
master_pid => 30586,
terminated => 0,
}
=end comment
=cut
# Build the status records of one monitoring cycle.
#
# Parameters:
#   $jobs     - hashref: job ID => job record; the fields
#               'actual_output_path', 'script_type' and 'reference' are read
#   $lsf_jobs - hashref: job ID => LSF job object (see query_lsf); a job
#               without an entry is treated as no longer reported by LSF
#
# Returns a hashref: job ID => status record with the keys
# job_status_string, output_path, job_name, reference, job_id, master_host,
# master_pid and terminated.
#
# NOTE(review): PENDING, RUNNING, DONE and FAILED are not defined in this
# file -- presumably status bit masks provided by AMTC::LSF; confirm. They
# are called in the '&NAME' form, which hands the current @_ to the callee.
sub create_status_report_struct
{
my ( $jobs, $lsf_jobs ) = @_;
my %status_report;
foreach my $job_id ( keys %$jobs )
{
my $lsf_job = $lsf_jobs->{ $job_id };
# Convention: bstart.pl always sets/updates the
# "actual_output_path" in the job list file for each successful
# job start.
my $output_path = $jobs->{$job_id}->{actual_output_path};
my $script_type = $jobs->{$job_id}->{script_type};
# init record: status 'unknown' and not terminated until proven otherwise
my $record = {
job_status_string => 'unknown',
output_path => $output_path,
job_name => '',
reference => $jobs->{$job_id}->{reference} || '',
job_id => '',
master_host => '',
master_pid => '',
terminated => 0,
};
# The record is registered right away; the branches below modify it in place.
$status_report{$job_id} = $record;
if ( defined $lsf_job )
{
# LSF still reports the job: take over its runtime attributes.
my $rinfo = $lsf_job->runtime_info;
$record->{job_status_string} = lc $rinfo->job_status_string;
$record->{job_name} = $rinfo->job_name;
$record->{job_id} = $rinfo->job_id;
$record->{master_host} = $rinfo->first_exec_host;
$record->{master_pid} = $rinfo->job_pid;
$Log->debug(
"job [$job_id] status: '$record->{job_status_string}'" );
# The status is tested bitwise; the first matching branch wins.
my $status = $rinfo->job_status;
if ( $status & &PENDING )
{
# do nothing
}
elsif ( $status & &RUNNING )
{
# Append the progress information taken from the job's log.
# NOTE(review): the "; " separator is appended even if
# get_fracture_status returns an empty string.
if ( $output_path )
{
my $fracture_status = get_fracture_status(
$output_path, $script_type );
$record->{job_status_string} .= "; $fracture_status";
}
}
elsif ( $status & &DONE )
{
$record->{terminated} = 1;
# also verify the corresponding log (if any)
if ( defined $output_path )
{
if ( ! -e $output_path )
{
$record->{job_status_string} =
"unknown; no log available";
}
else
{
# An undef result means that no end marker regex is configured for
# this script type; the status reported by LSF is kept in that case.
my $marker_exists = end_marker_exists(
$output_path, $script_type );
if ( defined $marker_exists && ! $marker_exists )
{
$record->{job_status_string} =
"unknown; no completion marker in log"
}
}
}
}
elsif ( $status & &FAILED )
{
$record->{terminated} = 1;
}
else
{
# None of the four known states: the job stays in the monitoring.
$record->{job_status_string} .=
"; job might have been suspended";
}
}
else # assume the job has "fallen out" of the active LSF job reporting
{
$Log->debug( "job [$job_id] status: not reported" );
$record->{terminated} = 1;
# Without LSF information the end status is derived from the log only.
if ( defined( $output_path ) && -e $output_path )
{
my $marker_exists = end_marker_exists(
$output_path, $script_type );
if ( $marker_exists )
{
$record->{job_status_string} = "done";
}
elsif ( ! defined $marker_exists )
{
# no end marker regex available for this script type
$record->{job_status_string} = "unknown";
}
else
{
# regex is configured, but the log has no matching line
$record->{job_status_string} = "failed";
}
}
else
{
$record->{job_status_string} =
"unknown; no log available";
}
}
}
return \%status_report;
}
# Inspect the last line of the given file, and return that last line
# (w/o nl char) if and only if it matches a regular expression that has
# been defined for the given script type. Otherwise return an empty string.
#
# The file is read directly instead of via "tail" in a shell, so that file
# names with blanks or shell meta characters are handled correctly and can
# never be interpreted as shell code.
#
# Example: " 42.00% at 14:45 00.48e 01.06r 345kf 0b 100%c"
sub get_fracture_status
{
    my ( $log_file, $script_type ) = @_;
    return '' unless $script_type;
    my $last_line = _read_last_line( $log_file );
    $last_line =
        get_matching_fracture_status_line( $last_line, $script_type ) || '';
    chomp $last_line;
    return $last_line;
}

# Return the last line of the given file including its newline character
# (if any). Return an empty string if the file is empty or cannot be read.
sub _read_last_line
{
    my ( $path ) = @_;
    return '' unless defined $path;
    open my $fh, '<', $path or return '';

    # Read fixed size chunks backwards from the end of the file until the
    # start of the last line is known.
    my $chunk_size = 4096;
    my $remaining  = -s $fh;
    my $tail       = '';
    while ( $remaining )
    {
        my $length = $remaining < $chunk_size ? $remaining : $chunk_size;
        $remaining -= $length;
        seek( $fh, $remaining, 0 ) or last;
        my $got = read( $fh, my $chunk, $length );
        last unless $got;
        $tail = $chunk . $tail;

        # A newline that is not the very last character of the file marks
        # the end of the line before the last one.
        my $newline_at = index $tail, "\n";
        last if $newline_at >= 0 && $newline_at < length( $tail ) - 1;
    }
    close $fh;

    # Strip everything in front of the last line.
    $tail =~ s/\A.*\n(?=.)//s;
    return $tail;
}
=begin comment

Hands the given line back unchanged, provided that it matches the progress
pattern configured for the script type, e.g. a line whose head looks like...

   3.00%

A typical block of progress status lines during a CATS fracture looks like
this...

   1.00% at 11:00 00.02e 03.17r
   2.00% at 11:00 00.04e 03.15r 174f 6960b 0%c
   3.00% at 11:00 00.06e 03.13r 353f 14.1kb 0%c
   4.00% at 11:01 00.08e 03.11r 421f 16.8kb 0%c
   5.00% at 11:01 00.10e 03.09r 490f 19.6kb 0%c

=end comment

=cut

# Returns $line if it matches the 'current_status_regex' that is configured
# for $script_type. Returns nothing if the script type is undef, if no regex
# is configured for it, or if the line does not match.
sub get_matching_fracture_status_line
{
    my ( $line, $script_type ) = @_;
    if ( defined $script_type )
    {
        # see regex in bmon.ini
        my $pattern = $Config->current_status_regex->{$script_type};
        return $line if defined $pattern && $line =~ /$pattern/;
    }
    return;
}
# Tell whether the given $output_path contains a so-called end marker.
#
# The end marker is a line of the file which matches the regular expression
# that is configured for the script type by means of the directive
# "end_marker_regex" in the bmon.ini file.
#
# Returns the number of matching lines, i.e. a true value if the marker
# exists and 0 if the regex is configured, but no line matches. Returns
# undef, if the script type is undef or no regex is configured for it.
sub end_marker_exists
{
    my ( $output_path, $script_type ) = @_;
    return unless defined $script_type;

    # see regex in bmon.ini
    my $pattern = $Config->end_marker_regex->{$script_type};
    return unless defined $pattern;

    my @lines       = AMTC::FileUtils::read_file( $output_path );
    my $match_count = grep { $_ =~ /$pattern/ } @lines;
    return $match_count;
}
# Switch from the basic logging to the configured logging. Nothing is done
# when running under unit test (UNITTEST set in the environment).
sub reinit_logging
{
    return if $ENV{UNITTEST};

    # Step 1: if the log4perl ini-file exists, initialize log4perl from it,
    # then fetch the logger again.
    my $log4perl_ini = "$TECHP_DATA/AMTC-RMS-Batch/bmon-log4perl.ini";
    if ( -e $log4perl_ini )
    {
        $Log->debug( "Re-init log4perl using $log4perl_ini" );
        Log::Log4perl->init( $log4perl_ini );
    }
    $Log = Log::Log4perl->get_logger;

    # Step 2: in TPSM mode a session log file named after the joblist file
    # is opened in addition.
    if ( defined $Opt_tpsm )
    {
        my $session_log =
            AMTC::App::RmsBatch::create_logfile_name( $Joblist_File );
        AMTC::App::RmsBatch::open_session_logfile( $Log, $session_log );
    }
}
1;
__END__
=pod
=head1 NAME
bmon.pl - Batch Monitor for LSF jobs
=head1 USAGE
bmon.pl [-h] [-man] [-seconds seconds] [-tpsm identifier] -joblist file
=head1 REQUIRED ARGUMENTS
The batch monitor has no required arguments. However the "option" C<-joblist>
is mandatory instead.
If you feel this is weird, then you are right. Unfortunately in order to make
the batch starter able to be used with the TPSM, the batch starter's command
line interface is enforced to follow that TPSM design constraint.
Again, the batch monitor requires one mandatory "option"; the so-called job
list file. That file is expected to be an XML document which describes LSF
jobs. An XML parser validates the given file. See the man page of
C<AMTC::RMS::Batch> in the C<amtc-common-perl> package for the documentation
of the XML structure.
=head1 OPTIONS
=head2 -h
Print a short help text with version, usage and options.
=head2 -man
Print the man page.
=head2 -seconds
Set the number of seconds the batch monitor waits before LSF is queried again,
as long as there are non-terminated jobs. The default is 60 seconds.
=head2 -tpsm session_identifier
Write some TPSM specific XML to STDOUT when the program finishes. This option
is only necessary if the batch monitor is to be integrated into a TPSM flow.
If the bmon session ran without error, then the session identifier is
embedded in the event string for the TPSM. Otherwise the event string is
"ERROR".
Example of the XML (in case the session identifier is "FOO"):
<RESULT>
<EVENTS>
<EVENT id="BMON_FOO_OK" />
</EVENTS>
<OUTPUTS />
</RESULT>
=head1 DESCRIPTION
The batch monitor frequently queries LSF to report the status of a set of LSF
jobs. Once all jobs of interest have finished the batch monitor prints
a summary of the end status of all jobs and terminates. The reports and the
summary are written by means of log4perl. An optional configuration file can
be used to control how the program output is logged. (See the configuration
section below.)
The batch monitor reads a job list file to determine which jobs are to be
monitored. (See also the required arguments section above.) If the given job
list file is valid, then all jobs of that list which have a job ID will be
monitored.
From the batch monitor's perspective a job can
have one of the following statuses:
=over 4
=item PENDING
=item RUNNING
=item DONE
=item FAILED
=back
Beyond retrieving a job's status from LSF various call back handlers might be
called in order to inspect a job more thoroughly. (See the call back
handlers section below.)
=head1 REPORT FORMAT
The batch monitor starts with reporting all jobs which are B<not> monitored.
Each line includes the number of the job in the corresponding job list file
and the job reference (if set).
Example:
Job 2 [FRAME ] not started
Job 34 [PRIME1] not started
After the initial block of not started jobs (which may be empty) a block
of job statuses is frequently reported. Each job that has been reported as
either I<DONE> or I<FAILED> will not be reported any more until all monitored
jobs have finished and a summary of all jobs is printed. The effect of this
behavior is that a potentially long list of jobs may become shorter with each
report block, because I<done> and I<failed> jobs are reported only once.
A report block of such monitored jobs is a set of lines describing the status
of each monitored job and some of its RMS runtime attributes. The following
attributes are reported:
=over 4
=item LSF Job Name
=item Job Reference (as set in the job list file)
=item LSF Job ID
=item Master Host
=item Master PID
=item Job Status
=back
Example:
Job PRIME02_85.CATS_Fracture_TCL [PRIME02_85,71596,amux129,11291] done
Job ALIGN2_0.CATS_Fracture_TCL [ALIGN2_0,71602,amux131,13357 ] failed
=head1 CALL BACK HANDLERS
If a job has the attributes I<output_path> and I<script_type> (in the job list
file), then a type specific call back handler inspects the given
I<output_path> (file) in order to get application/type specific information
about the monitored job. For example, LSF reports a running job, but it does
not tell anything about what the job is actually doing. However the running
job may frequently write some percentage of completion to its I<output_path>.
This kind of information can be extracted by the handler in order to report
it.
Also, after a job has finished the handler reads the I<output_path> to
determine the final end status of a job. For example, LSF might have reported
a job as I<DONE>, because the job's exit code was 0. If the corresponding
I<output_path> (file) indicates some errors, then the handler may decide to
change the job's end status to I<FAILED>. It depends on the rules that are
implemented in the I<script_type> specific handler.
Currently the handlers are controlled by means of regular expressions which
are configured as described in the CONFIGURATION AND ENVIRONMENT section.
=head1 CONFIGURATION AND ENVIRONMENT
The program can be controlled via command line options, an optional
configuration file and a log4perl configuration file.
=head2 Configuration file
An optional configuration file may control the program's behavior. A
configuration entry is called a directive. See the documentation of the CPAN
module C<AppConfig> for the syntax description.
All defined directives are described below.
=head3 current_status_regex
This directive is used to match lines of an "output_path" file (aka logfile)
which indicate the current status of a given job. The type of the directive
is C<multiple hash value>. The key of the directive C<current_status_regex>
is interpreted as a "script_type" and the value as a regex to be used when
inspecting the "output_path" file of the job.
Example:
current_status_regex CATS_Fracture_TCL = ^\s+\d+\.00%
=head3 end_marker_regex
This directive is used to determine the success of a given job based on the
existence of a so-called end marker in the corresponding "output_path" file
(aka logfile). The type of the directive is C<multiple hash value>.
LSF might report a job as DONE (i.e. successful). However if the corresponding
"output_path" file has no end marker string, then the batch monitor reports
the job as being FAILED.
The key of the directive C<end_marker_regex> is interpreted as a "script_type"
and the value as a regex to be used when inspecting the "output_path" file of
the job.
Example:
end_marker_regex CATS_Fracture_TCL = ## PROCESS_TAG = 'FRACTURE_COMPLETE' ##
=head2 Log4perl configuration
An optional log4perl configuration file can be used to control the output
of the program. The program tries to read the configuration file from
$TECHP_DATA/AMTC-RMS-Batch/bmon-log4perl.ini
If it fails, then the output is written to STDOUT in a default format.
(timestamp and log level followed by the status information.)
=head1 EXIT STATUS
The batch monitor returns the exit status 0 if and only if all monitored jobs
have terminated with the end status DONE. This means implicitly that the
batch monitor returns 0 if no job was to be monitored at all. Otherwise an
exit status different from 0 is returned.
=head1 DEPENDENCIES
=over 4
=item LSF 9.1
=item Log::Log4perl 1.46
=item XML::LibXML 2.0118
=item AMTC::LSF 1.0
=item AppConfig 1.68
=item XML::Simple 2.20
=item amtc-common-perl 1.003
=back
=head1 AUTHOR
Olaf Heyder E<lt>heydero@drux25E<gt> or E<lt>olaf.heyder@amtc-dresden.comE<gt>
If you found a bug, please talk to MDP first. It might be a feature. If not,
then please report it using L<https://helpdesk.photomask.com/>.
=head1 COPYRIGHT
Copyright (C) 2016 by Advanced Mask Technology Center GmbH & Co. KG
This script is for AMTC business use only. All information is confidential.
=cut