From 7ffc9ddeb3c8436e16aece88364462c89672a183 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 26 May 2018 18:01:10 -0400 Subject: [PATCH] [scripts] Make retry.pl deal correctly with keyboard interrupts; make num-tries configurable. (#2456) --- egs/wsj/s5/utils/parallel/queue.pl | 3 ++- egs/wsj/s5/utils/parallel/retry.pl | 41 +++++++++++++++++------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/egs/wsj/s5/utils/parallel/queue.pl b/egs/wsj/s5/utils/parallel/queue.pl index 1165808c092..e14af5ef6e3 100755 --- a/egs/wsj/s5/utils/parallel/queue.pl +++ b/egs/wsj/s5/utils/parallel/queue.pl @@ -94,7 +94,8 @@ sub caught_signal { if ( defined $sge_job_id ) { # Signal trapped after submitting jobs my $signal = $!; system ("qdel $sge_job_id"); - die "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n"; + print STDERR "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n"; + exit(2); } } diff --git a/egs/wsj/s5/utils/parallel/retry.pl b/egs/wsj/s5/utils/parallel/retry.pl index e21fe0103b9..a039d6f5a74 100755 --- a/egs/wsj/s5/utils/parallel/retry.pl +++ b/egs/wsj/s5/utils/parallel/retry.pl @@ -14,29 +14,27 @@ # e.g. if your command line was "queue.pl [args]", you can replace that # with "retry.pl queue.pl [args]" and it will retry jobs that failed. -my $qsub_opts = ""; -my $sync = 0; -my $num_threads = 1; -my $gpu = 0; -my $config = "conf/queue.conf"; - -my %cli_options = (); - -my $jobname; -my $jobstart; -my $jobend; -my $array_job = 0; -my $sge_job_id; +my $num_tries = 2; sub print_usage() { print STDERR "Usage: retry.pl \n" . " e.g.: retry.pl [options] queue.pl foo.log do_something\n" . - "This will retry jobs that failed (only once)\n"; + "This will retry jobs that failed (only once)\n" . + "Options:\n" . + " --num-tries # default: 2\n"; exit 1; } +if ($ARGV[0] eq "--num-tries") { + shift; + $num_tries = $ARGV[0] + 0; + if ($num_tries < 1) { + die "$0: invalid option --num-tries $ARGV[0]"; + } + shift; +} if (@ARGV < 3) { print_usage(); @@ -71,15 +69,24 @@ sub get_log_file { my $log_file = get_log_file(); my $return_status; -# we may later make $num_tries configurable. -my $num_tries = 2; - for (my $n = 1; $n <= $num_tries; $n++) { system(@ARGV); $return_status = $?; if ($return_status == 0) { exit(0); # The command succeeded. We return success. + } elsif ($return_status != 256) { + # The command did not "die normally". When queue.pl and similar scripts + # detect a normal error, they exit(1), which becomes a status of 256 + # in perl's $? variable. + # See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info. + # An example of an abnormal death that would cause us to want to exit + # immediately, is when the user does ctrl-c or KILLs the script, + # which gets caught by 'caught_signal' in queue.pl and causes that program + # to return with exit status 2. + exit(1); } + + if ($n < $num_tries) { if (! -f $log_file) { # $log_file doesn't exist as a file. Maybe it was an array job.