Skip to content

Commit

Permalink
[scripts] Replace sed commands using \t and \n for OS X compatibility (k…
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaohui-zhang authored and danpovey committed Oct 11, 2017
1 parent 8bb27a9 commit 7ed7311
Show file tree
Hide file tree
Showing 22 changed files with 37 additions and 41 deletions.
4 changes: 2 additions & 2 deletions egs/babel/s5/local/make_lexicon_subset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ input_lexicon_file=$2
output_lexicon_file=$3

(
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
) | sort -u | awk '
BEGIN {
while(( getline line< ARGV[2] ) > 0 ) {
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5b/local/datasets/extra_kws.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ function setup_oov_search {
<(cat $kwlist | grep -o -P "(?<=<kwtext>).*(?=</kwtext>)" | uconv -f utf-8 -t utf-8 -x Any-Lower) \
>$kwsdatadir/keywords.txt
cut -f 2 $kwsdatadir/keywords.txt | \
sed 's/\s\s*/\n/g' | sort -u > $kwsdatadir/oov.txt
perl -ape 's/\s\s*/\n/g;' | sort -u > $kwsdatadir/oov.txt


#Generate the confusion matrix
Expand Down
4 changes: 2 additions & 2 deletions egs/babel/s5b/local/make_lexicon_subset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ input_lexicon_file=$2
output_lexicon_file=$3

(
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
) | sort -u | awk '
BEGIN {
while(( getline line< ARGV[2] ) > 0 ) {
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5b/local/train_g2p.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ fi
if [ ! -z $icu_transform ] ; then
paste \
<(cat $lexicon | awk '{print $1}' | uconv -f $encoding -t $encoding -x "$icu_transform") \
<(cat $lexicon | sed $'s/^[^ \t][^ \t]*[ \t]//g') \
<(cat $lexicon | perl -ape 's/^[^ \t][^ \t]*[ \t]//g;') \
> $wdir/lexicon_transformed.txt
lexicon=$wdir/lexicon_transformed.txt
fi
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5c/local/datasets/extra_kws.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ function setup_oov_search {
<(cat $kwlist | grep -o -P "(?<=<kwtext>).*(?=</kwtext>)" | uconv -f utf-8 -t utf-8 -x Any-Lower) \
>$kwsdatadir/keywords.txt
cut -f 2 $kwsdatadir/keywords.txt | \
sed 's/\s\s*/\n/g' | sort -u > $kwsdatadir/oov.txt
perl -ape 's/\s\s*/\n/g;' | sort -u > $kwsdatadir/oov.txt


#Generate the confusion matrix
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5c/local/generate_confusion_matrix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ for i in `seq 1 $nj` ; do
done

echo "Converting statistics..."
cat $confusion_files | cut -f 2- -d ' ' | sed 's/ *; */\n/g'| sort | uniq -c | \
cat $confusion_files | cut -f 2- -d ' ' | perl -ape 's/ *; */\n/g;'| sort | uniq -c | \
grep -v -E '<oov>|<sss>|<vns>|SIL' | \
perl -ane '
die unless scalar @F == 3;
Expand Down
4 changes: 2 additions & 2 deletions egs/babel/s5c/local/make_lexicon_subset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ input_lexicon_file=$2
output_lexicon_file=$3

(
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
) | sort -u | awk '
BEGIN {
while(( getline line< ARGV[2] ) > 0 ) {
Expand Down
10 changes: 5 additions & 5 deletions egs/babel/s5d/local/best_scores_kws.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export kwsets=( `find ${mydirs[@]} -type d -name "kwset*" -not \( -ipath "*syll
done | \
while IFS='' read -r line || [[ -n "$line" ]]; do
file=$(echo $line | sed 's/:.*//g' )
cat $file | sed 's/ *, */\n/g' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
cat $file | perl -ape 's/ *, */\n/g;' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
done
) | column -t | sort -k3,3g | \
(
Expand All @@ -64,7 +64,7 @@ export kwsets=( `find ${mydirs[@]} -type d -name "kwset*" -not \( -ipath "*syll
done | \
while IFS='' read -r line || [[ -n "$line" ]]; do
file=$(echo $line | sed 's/:.*//g' )
cat $file | sed 's/ *, */\n/g' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
cat $file | perl -ape 's/ *, */\n/g;' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
done
) | column -t | sort -k3,3g | \
(
Expand Down Expand Up @@ -92,7 +92,7 @@ export kwsets=( `find ${mydirs[@]} -type d -name "kwset*" -not \( -ipath "*syll
done | \
while IFS='' read -r line || [[ -n "$line" ]]; do
file=$(echo $line | sed 's/:.*//g' )
cat $file | sed 's/ *, */\n/g' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
cat $file | perl -ape 's/ *, */\n/g;' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
done
) | column -t | sort -k3,3g | \
(
Expand Down Expand Up @@ -125,7 +125,7 @@ export kwsets=( `find ${mydirs[@]} -type d -name "kwset*" -not \( -ipath "*syll
done | \
while IFS='' read -r line || [[ -n "$line" ]]; do
file=$(echo $line | sed 's/:.*//g' )
cat $file | sed 's/ *, */\n/g' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
cat $file | perl -ape 's/ *, */\n/g;' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
done
) | column -t | sort -k3,3g | \
(
Expand Down Expand Up @@ -158,7 +158,7 @@ export kwsets=( `find ${mydirs[@]} -type d -name "kwset*" -not \( -ipath "*syll
done | \
while IFS='' read -r line || [[ -n "$line" ]]; do
file=$(echo $line | sed 's/:.*//g' )
cat $file | sed 's/ *, */\n/g' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
cat $file | perl -ape 's/ *, */\n/g;' | sed 's/ //g' | grep -E 'TWV|THR' | paste -s | paste - <(echo $file)
done
) | column -t | sort -k3,3g | \
(
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5d/local/datasets/extra_kws.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ function setup_oov_search {
<(cat $kwlist | grep -o -P "(?<=<kwtext>).*(?=</kwtext>)" | uconv -f utf-8 -t utf-8 -x Any-Lower) \
>$kwsdatadir/keywords.txt
cut -f 2 $kwsdatadir/keywords.txt | \
sed 's/\s\s*/\n/g' | sort -u > $kwsdatadir/oov.txt
perl -ape 's/\s\s*/\n/g;' | sort -u > $kwsdatadir/oov.txt


#Generate the confusion matrix
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5d/local/generate_confusion_matrix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ for i in `seq 1 $nj` ; do
done

echo "Converting statistics..."
cat $confusion_files | cut -f 2- -d ' ' | sed 's/ *; */\n/g' | \
cat $confusion_files | cut -f 2- -d ' ' | perl -ape 's/ *; */\n/g;' | \
sed 's/ *$//g' | sed 's/^ *//g' | sort | uniq -c | \
grep -v -E '<oov>|<sss>|<vns>|SIL' | \
perl -ane '
Expand Down
4 changes: 2 additions & 2 deletions egs/babel/s5d/local/make_lexicon_subset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ input_lexicon_file=$2
output_lexicon_file=$3

(
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
#find $dev_data_dir/transcription/ -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
) | sort -u | awk '
BEGIN {
while(( getline line< ARGV[2] ) > 0 ) {
Expand Down
2 changes: 1 addition & 1 deletion egs/babel/s5d/local/make_wordlist.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ transcriptions=$1
wordlist=$2

(
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | sed 's/ /\n/g'
find $transcriptions -name "*.txt" | xargs egrep -vx '\[[0-9.]+\]' |cut -f 2- -d ':' | perl -ape 's/ /\n/g;'
) | sort -u | grep -v -E '.*\*.*|<.*>|\(\(\)\)|^-.*|.*-$' > $wordlist

12 changes: 4 additions & 8 deletions egs/chime1/s5/local/chime1_prepare_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@ scp="$data/train/wav.scp"
rm -f "$scp"
for sid in `seq 34`; do
sid2=`printf "s%02d" $sid`
tab=$'\t'
ls -1 $wav_train/id$sid/*.wav \
| sed "s/\(.*\)\/\(.*\).wav/${sid2}_\2${tab}\1\/\2.wav/" \
| perl -ape "s/(.*)\/(.*).wav/${sid2}_\2\t\1\/\2.wav/;" \
| sort >> $scp
done
for x in "devel" "test"; do
Expand All @@ -53,9 +52,8 @@ for x in "devel" "test"; do
wav_dir="${!wav_var}"
for sid in `seq 34`; do
sid2=`printf "s%02d" $sid`
tab=$'\t'
ls -1 $wav_dir/*/s${sid}_*.wav \
| sed "s/\(.*\)\/\(.*\)\/s.*_\(.*\).wav/${sid2}_\3_\2${tab}\1\/\2\/s${sid}_\3.wav/" \
| perl -ape "s/(.*)\/(.*)\/s.*_(.*).wav/${sid2}_\3_\2$\t\1\/\2\/s${sid}_\3.wav/;" \
| sort >> $scp
done
fi
Expand All @@ -70,11 +68,9 @@ for x in $set_list; do

# Create utt2spk files
# No speaker ID
tab=$'\t'
sed "s/\(.*\)${tab}.*/\1${tab}\1/" < "$scp" > "$data/$x/utt2spk"
perl -ape "s/(.*)\t.*/\1$\t\1/;" < "$scp" > "$data/$x/utt2spk"
# Use speaker ID
# tab=$'\t'
# sed "s/\(s..\)\(.*\)\${tab}.*/\1\2${tab}\1/" < "$scp" > "$data/$x/utt2spk"
# perl -ape "s/(s..)(.*)\\t.*/\1\2\t\1/;" < "$scp" > "$data/$x/utt2spk"

# Create spk2utt files
cat "$data/$x/utt2spk" | $utils/utt2spk_to_spk2utt.pl > "$data/$x/spk2utt" || exit 1;
Expand Down
2 changes: 1 addition & 1 deletion egs/csj/s5/local/csj_make_trans/csj_autorun.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ if [ ! -e $outd/.done_make_lexicon ]; then
sort -u $lexicon/lexicon.txt > $lexicon/lexicon_htk.txt
local/csj_make_trans/vocab2dic.pl -p local/csj_make_trans/kana2phone -e $lexicon/ERROR_v2d -o $lexicon/lexicon.txt $lexicon/lexicon_htk.txt
cut -d'+' -f1,3- $lexicon/lexicon.txt >$lexicon/lexicon_htk.txt
cut -f1,3- $lexicon/lexicon_htk.txt | sed $'s:[\t]: :g' >$lexicon/lexicon.txt
cut -f1,3- $lexicon/lexicon_htk.txt | perl -ape 's:\t: :g' >$lexicon/lexicon.txt

if [ -s $lexicon/lexicon.txt ] ;then
echo -n >$outd/.done_make_lexicon
Expand Down
2 changes: 1 addition & 1 deletion egs/farsdat/s5/local/farsdat_data_prep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ sed -e 's:.*/PH\([1-2]\)\(.*\)\.\(.*\)$:\2 \1 \3:i' $tmpdir/phn.flist |\

while read line; do
[ -f $line ] || error_exit "Cannot find transcription file '$line'";
cut -c1 "$line" | tr '\n' ' ' | sed -e 's: *$:\n:' || exit 1;
cut -c1 "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' || exit 1;
done < $tmpdir/phn.flist > $tmpdir/phn.trans || exit 1;

paste $tmpdir/phn.uttids $tmpdir/phn.trans | sort -k1,1 > $dir/trans || exit 1;
Expand Down
6 changes: 3 additions & 3 deletions egs/gale_mandarin/s5/local/gale_prep_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ esac

# extract full vocabulary
cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\
sed -e 's/ /\n/g' | sort -u | \
perl -ape 's/ /\n/g;' | sort -u | \
grep -v '\[LAUGHTER\]' | \
grep -v '\[NOISE\]' |\
grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt
Expand Down Expand Up @@ -139,8 +139,8 @@ cat $dict_dir/ch-dict.txt |\
}
' > $dict_dir/ch-dict-1.txt

cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep -v '^$' > $dict_dir/ch-char.txt
cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt
cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | perl -ape 's/(\S)/\1\n/g;' | grep -v '^$' > $dict_dir/ch-char.txt
cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | perl -ape 's/ /\n/g;' > $dict_dir/ch-char-pinyin.txt
wc -l $dict_dir/ch-char.txt
wc -l $dict_dir/ch-char-pinyin.txt
paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt
Expand Down
4 changes: 2 additions & 2 deletions egs/hkust/s5/local/hkust_prepare_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ mkdir -p $dict_dir/lexicon-{en,ch}

# extract full vocabulary
cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\
sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\
perl -ape 's/ /\n/g;' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\
grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt || exit 1;

# split into English and Chinese
Expand Down Expand Up @@ -201,7 +201,7 @@ cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\
# extract individual pinyins
cat $dict_dir/cedict/ch-dict-1.txt |\
awk '{for(i=2; i<=NF; i++) print $i}' |\
sed -e 's/ /\n/g' > $dict_dir/lexicon-ch/ch-char-pinyin.txt || exit 1;
perl -ape 's/ /\n/g;' > $dict_dir/lexicon-ch/ch-char-pinyin.txt || exit 1;

# first make sure number of characters and pinyins
# are equal, so that a char-based dictionary can
Expand Down
2 changes: 1 addition & 1 deletion egs/hub4_spanish/s5/local/prepare_lexicon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ mv $text ${text}.orig
mv ${text}.clean $text
utils/fix_data_dir.sh `dirname $text`

cut -f 2- -d ' ' $text | sed 's/ /\n/g' | sort -u > $out/word_list.raw
cut -f 2- -d ' ' $text | perl -ape 's/ /\n/g;' | sort -u > $out/word_list.raw
(echo SIL; grep "<" $out/word_list.raw) | awk '{print $0, $0;}' > $out/silence_lexicon.txt
grep -v "<" $out/word_list.raw > $out/word_list.txt

Expand Down
4 changes: 2 additions & 2 deletions egs/multi_en/s5/local/g2p/apply_g2p.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ echo 'Gathering missing words...'
cat data/*/train/text | \
local/count_oovs.pl $lexicon | \
awk '{for(i=4; i<NF; i++) printf "%s",$i OFS; if(NF) printf "%s",$NF; printf ORS}' | \
sed 's/\s/\n/g' | \
perl -ape 's/\s/\n/g;' | \
grep -v 0 | sort | uniq | \
grep '^[a-z]*$' > $workdir/missing.txt
sort | uniq > $workdir/missing.txt

echo 'Synthesizing pronunciations for missing words...'
PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
Expand Down
2 changes: 1 addition & 1 deletion egs/reverb/s5/local/REVERB_create_mcdata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ fi

# Download and install nist tools
pushd $dir/ReleasePackage/reverb_tools_for_asr_ver2.0
sed -e "s|^main$|targetSPHEREDir\=tools/SPHERE\ninstall_nist|" installTools > installnist
perl -ape "s|^main$|targetSPHEREDir\=tools/SPHERE\ninstall_nist|;" installTools > installnist
chmod u+x installnist
./installnist
popd
Expand Down
2 changes: 1 addition & 1 deletion egs/timit/s5/local/timit_data_prep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ for x in train dev test; do
> $tmpdir/${x}_phn.uttids
while read line; do
[ -f $line ] || error_exit "Cannot find transcription file '$line'";
cut -f3 -d' ' "$line" | tr '\n' ' ' | sed -e 's: *$:\n:'
cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans
paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \
| sort -k1,1 > ${x}.trans
Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/get_prons.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ if [ $stage -le 5 ]; then
# 2. Collect bigram counts for words. To be more specific, we are actually
# collecting counts for "v ? w", where "?" represents silence or
# non-silence.
cat $dir/pron_perutt_nowb.txt | sed $'s/<eps>[^\t]*\t//g' | perl -e '
cat $dir/pron_perutt_nowb.txt | perl -ape 's/<eps>[^\t]*\t//g;' | perl -e '
while (<>) {
chomp; @col = split("\t");
for($i = 1; $i < scalar(@col) - 1; $i += 1) {
Expand Down

0 comments on commit 7ed7311

Please sign in to comment.