Skip to content

Commit

Permalink
Removing changes to split_scp.pl (kaldi-asr#3717)
Browse files Browse the repository at this point in the history
  • Loading branch information
danpovey committed Nov 14, 2019
1 parent 898c8da commit f679c78
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 156 deletions.
8 changes: 8 additions & 0 deletions egs/wsj/s5/utils/filter_scps.pl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@

# Some variables that we set to produce a warning.
$warn_uncovered = 0;
$warn_multiply_covered = 0;

for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
$idlist_n = $idlist;
Expand Down Expand Up @@ -132,6 +133,9 @@
$warn_uncovered = 1;
} else {
@jobs = @{$id2jobs{$id}}; # this dereferences the array reference.
if (@jobs > 1) {
$warn_multiply_covered = 1;
}
foreach $job_id (@jobs) {
if (!defined $job2output{$job_id}) {
die "Likely code error";
Expand Down Expand Up @@ -160,3 +164,7 @@
if ($warn_uncovered && $print_warnings) {
print STDERR "filter_scps.pl: warning: some input lines did not get output\n";
}
if ($warn_multiply_covered && $print_warnings) {
print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt] " .
join(" ", @ARGV) . "\n";
}
25 changes: 2 additions & 23 deletions egs/wsj/s5/utils/split_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,20 @@
# limitations under the License.

split_per_spk=true
allow_uneven_split=false
if [ "$1" == "--per-utt" ]; then
split_per_spk=false
shift
fi
if [ "$1" == "--allow-uneven-split" ]; then
allow_uneven_split=true
shift
fi

if [ $# != 2 ]; then
echo "Usage: $0 [--per-utt] [--allow-uneven-split] <data-dir> <num-to-split>"
echo "Usage: $0 [--per-utt] <data-dir> <num-to-split>"
echo "E.g.: $0 data/train 50"
echo "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the "
echo "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}."
echo ""
echo "This script will not split the data-dir if it detects that the output is newer than the input."
echo "By default it splits per speaker (so each speaker is in only one split dir),"
echo "but with the --per-utt option it will ignore the speaker information while splitting."
echo "To avoid crash caused by splitting imbalanced data use --allow-uneven-split"
exit 1
fi

Expand Down Expand Up @@ -73,11 +67,6 @@ if [ -f $data/text ] && [ $nu -ne $nt ]; then
echo "** use utils/fix_data_dir.sh to fix this."
fi

ns=`cat $data/spk2utt | wc -l`
if [ $numsplit -gt $ns ] && [ $split_per_spk = "true" ]; then
echo "You should reduce the number of jobs ($numsplit) as there are not enough speakers ($ns)."
exit 1
fi

if $split_per_spk; then
utt2spk_opt="--utt2spk=$data/utt2spk"
Expand All @@ -87,11 +76,6 @@ else
utt="utt"
fi

utt2dur_opt=
if [ -f $data/utt2dur ]; then
utt2dur_opt="--utt2dur=$data/utt2dur"
fi

s1=$data/split${numsplit}${utt}/1
if [ ! -d $s1 ]; then
need_to_split=true
Expand Down Expand Up @@ -124,12 +108,7 @@ fi
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM

even_split_opt=""
if $allow_uneven_split; then
even_split_opt="--allow-uneven-split"
fi

utils/split_scp.pl $even_split_opt $utt2spk_opt $utt2dur_opt $data/utt2spk $utt2spks || exit 1
utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1

for n in `seq $numsplit`; do
dsn=$data/split${numsplit}${utt}/$n
Expand Down
137 changes: 4 additions & 133 deletions egs/wsj/s5/utils/split_scp.pl
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# With the --utt2dur (and --utt2spk) option it will try to create equal-size
# chunks by duration. This can cause issues when there is a severe imbalance
# in the data (extreme example: 90% of the data is from one speaker), in which case
# the script will stop with an error message. This behaviour can be overridden
# with --allow-uneven-split.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
Expand All @@ -52,11 +47,9 @@
$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$utt2dur_file = "";
$one_based = 0;
$allow_uneven_split = 0;

for ($x = 1; $x <= 4 && @ARGV > 0; $x++) {
for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
if ($ARGV[0] eq "-j") {
shift @ARGV;
$num_jobs = shift @ARGV;
Expand All @@ -66,36 +59,25 @@
$utt2spk_file=$1;
shift;
}

if ($ARGV[0] =~ "--utt2dur=(.+)") {
$utt2dur_file=$1;
shift;
}

if ($ARGV[0] eq '--one-based') {
$one_based = 1;
shift @ARGV;
}
if ($ARGV[0] eq '--allow-uneven-split') {
$allow_uneven_split = 1;
shift @ARGV;
}
}

if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
$job_id - $one_based >= $num_jobs)) {
die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
($one_based ? " --one-based" : "") . "'\n"

}

$one_based
and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
die
"Usage: split_scp.pl [--allow-uneven-splits] [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--allow-uneven-splits] [--one-based] [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp [out.scp]
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n";
}

Expand All @@ -113,119 +95,8 @@
}
}
}
if ($utt2spk_file ne "" && $utt2dur_file ne "" ) { # --utt2spk and --utt2dur
open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
while(<U>) {
@A = split;
@A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
($u,$s) = @A;
$utt2spk{$u} = $s;
}
$dursum = 0.0;
open(U, "<$utt2dur_file") || die "Failed to open utt2dur file $utt2dur_file";
while(<U>) {
@A = split;
@A == 2 || die "Bad line $_ in utt2spk file $utt2dur_file";
($u,$d) = @A;
$dursum += $d;
$s = $utt2spk{$u};
if (!defined $spk2dur{$s}) {
$spk2dur{$s} = 0.0;
}
$spk2dur{$s} += $d;
}
open(I, "<$inscp") || die "Opening input scp file $inscp";
@spkrs = ();
while(<I>) {
@A = split;
if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
$u = $A[0];
$s = $utt2spk{$u};
if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
if(!defined $spk_count{$s}) {
push @spkrs, $s;
$spk_count{$s} = 0;
$spk_data{$s} = []; # ref to new empty array.
}
if(!defined $spk2utt{$s}) {
$spk2utt{$s} = [];
}
$spk_count{$s}++;
push @{$spk_data{$s}}, $_;
push @{$spk2utt{$s}}, $u;
}

$numspks = @spkrs; # number of speakers.
$numscps = @OUTPUTS; # number of output files.
if ($numspks < $numscps) {
die "Refusing to split data because number of speakers $numspks is less " .
"than the number of output .scp files $numscps";
}
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scparray[$scpidx] = []; # [] is array reference.
$scp2dur[$scpidx] = 0.0;
}
$splitdur = $dursum / $numscps;
$dursum = 0.0;
$scpidx = 0;
$dursum_current = 0.0;
for my $spk (sort (keys %spk2utt)) {
$scpcount[$scpidx] += $spk_count{$spk};
push @{$scparray[$scpidx]}, $spk;
$dur = $spk2dur{$spk};
$dursum += $dur;
$dursum_current += $dur;
if ($dursum >= $splitdur * ($scpidx + 1) && $dursum_current > 10.0) {
$scp2dur[$scpidx] = $dursum_current;
$scpidx += 1;
$dursum_current = 0.0;
if ($scpidx >= $numscps) {
last;
}
}
}

$smallest_dur = $splitdur;
$largest_dur = $splitdur;
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scpdur = $scp2dur[$scpidx];
if ($scpdur > $largest_dur) {
$largest_dur = $scpdur;
}
if ($scpdur < $smallest_dur) {
$smallest_dur = $scpdur;
}
}

if ($allow_uneven_split != 1) {
if (($smallest_dur < $largest_dur / 2 && $largest_dur > 3600) ||
$smallest_dur == 0.0) {
die "Trying to split data while taking duration into account leads to a " .
"severe imbalance in splits. This happens when there is a lot more data " .
"for some speakers than for others.\n" .
"You should use utils/data/modify_speaker_duration.sh to fix that.\n"
}
}

# Now print out the files...
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scpfn = $OUTPUTS[$scpidx];
open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
$count = 0;
if(@{$scparray[$scpidx]} == 0) {
print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
$error = 1;
} else {
foreach $spk ( sort @{$scparray[$scpidx]} ) {
print F @{$spk_data{$spk}};
$count += $spk_count{$spk};
}
if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
}
close(F);
}
} elsif ($utt2spk_file ne "") { # We have the --utt2spk option...

if ($utt2spk_file ne "") { # We have the --utt2spk option...
open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
while(<$u_fh>) {
@A = split;
Expand Down

0 comments on commit f679c78

Please sign in to comment.