Skip to content

Commit

Permalink
[scripts] Make retry.pl deal correctly with keyboard interrupts; make…
Browse files Browse the repository at this point in the history
… num-tries configurable. (kaldi-asr#2456)
  • Loading branch information
danpovey committed May 26, 2018
1 parent d6d49d0 commit 7ffc9dd
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 18 deletions.
3 changes: 2 additions & 1 deletion egs/wsj/s5/utils/parallel/queue.pl
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ sub caught_signal {
if ( defined $sge_job_id ) { # Signal trapped after submitting jobs
my $signal = $!;
system ("qdel $sge_job_id");
die "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n";
print STDERR "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n";
exit(2);
}
}

Expand Down
41 changes: 24 additions & 17 deletions egs/wsj/s5/utils/parallel/retry.pl
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,27 @@
# e.g. if your command line was "queue.pl [args]", you can replace that
# with "retry.pl queue.pl [args]" and it will retry jobs that failed.

my $qsub_opts = "";
my $sync = 0;
my $num_threads = 1;
my $gpu = 0;

my $config = "conf/queue.conf";

my %cli_options = ();

my $jobname;
my $jobstart;
my $jobend;
my $array_job = 0;
my $sge_job_id;
my $num_tries = 2;

sub print_usage() {
print STDERR
"Usage: retry.pl <some-other-wrapper-script> <rest-of-command>\n" .
" e.g.: retry.pl [options] queue.pl foo.log do_something\n" .
"This will retry jobs that failed (only once)\n";
"This will retry jobs that failed (only once)\n" .
"Options:\n" .
" --num-tries <n> # default: 2\n";
exit 1;
}

if ($ARGV[0] eq "--num-tries") {
shift;
$num_tries = $ARGV[0] + 0;
if ($num_tries < 1) {
die "$0: invalid option --num-tries $ARGV[0]";
}
shift;
}

if (@ARGV < 3) {
print_usage();
Expand Down Expand Up @@ -71,15 +69,24 @@ sub get_log_file {
my $log_file = get_log_file();
my $return_status;

# we may later make $num_tries configurable.
my $num_tries = 2;

for (my $n = 1; $n <= $num_tries; $n++) {
system(@ARGV);
$return_status = $?;
if ($return_status == 0) {
exit(0); # The command succeeded. We return success.
} elsif ($return_status != 256) {
# The command did not "die normally". When queue.pl and similar scripts
# detect a normal error, they exit(1), which becomes a status of 256
# in perl's $? variable.
# See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info.
# An example of an abnormal death that would cause us to want to exit
# immediately, is when the user does ctrl-c or KILLs the script,
# which gets caught by 'caught_signal' in queue.pl and causes that program
# to return with exit status 2.
exit(1);
}


if ($n < $num_tries) {
if (! -f $log_file) {
# $log_file doesn't exist as a file. Maybe it was an array job.
Expand Down

0 comments on commit 7ffc9dd

Please sign in to comment.