Skip to content

Commit

Permalink
[egs] Madcat arabic: easier configuration of data locations; fix wer_output_filter. (kaldi-asr#2440)
Browse files Browse the repository at this point in the history
  • Loading branch information
aarora8 authored and danpovey committed May 22, 2018
1 parent b1be44e commit 1ab3df0
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 71 deletions.
26 changes: 9 additions & 17 deletions egs/madcat_ar/v1/local/create_line_image_from_page_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@
help='Path to file that contains the train/test/dev split information')
parser.add_argument('out_dir', type=str,
help='directory location to write output files')
parser.add_argument('writing_condition1', type=str,
help='Path to the downloaded (and extracted) writing conditions file 1')
parser.add_argument('writing_condition2', type=str,
help='Path to the downloaded (and extracted) writing conditions file 2')
parser.add_argument('writing_condition3', type=str,
help='Path to the downloaded (and extracted) writing conditions file 3')
parser.add_argument('--padding', type=int, default=400,
help='padding across horizontal/verticle direction')
args = parser.parse_args()
Expand Down Expand Up @@ -541,23 +547,9 @@ def check_writing_condition(wc_dict, base_name):

def main():

writing_condition_folder_list = args.database_path1.split('/')
writing_condition_folder1 = ('/').join(writing_condition_folder_list[:5])

writing_condition_folder_list = args.database_path2.split('/')
writing_condition_folder2 = ('/').join(writing_condition_folder_list[:5])

writing_condition_folder_list = args.database_path3.split('/')
writing_condition_folder3 = ('/').join(writing_condition_folder_list[:5])

writing_conditions1 = os.path.join(writing_condition_folder1, 'docs', 'writing_conditions.tab')
writing_conditions2 = os.path.join(writing_condition_folder2, 'docs', 'writing_conditions.tab')
writing_conditions3 = os.path.join(writing_condition_folder3, 'docs', 'writing_conditions.tab')

wc_dict1 = parse_writing_conditions(writing_conditions1)
wc_dict2 = parse_writing_conditions(writing_conditions2)
wc_dict3 = parse_writing_conditions(writing_conditions3)

wc_dict1 = parse_writing_conditions(args.writing_condition1)
wc_dict2 = parse_writing_conditions(args.writing_condition2)
wc_dict3 = parse_writing_conditions(args.writing_condition3)
output_directory = args.out_dir
image_file = os.path.join(output_directory, 'images.scp')
image_fh = open(image_file, 'w', encoding='utf-8')
Expand Down
12 changes: 8 additions & 4 deletions egs/madcat_ar/v1/local/extract_lines.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ cmd=run.pl
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
dataset_file=data/download/data_splits/madcat.dev.raw.lineid
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
data_split_file=data/download/data_splits/madcat.dev.raw.lineid
data=data/local/dev
echo "$0 $@"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

data=$1
log_dir=$data/log

mkdir -p $log_dir
Expand All @@ -23,14 +26,15 @@ for n in $(seq $nj); do
split_scps="$split_scps $log_dir/lines.$n.scp"
done

utils/split_scp.pl $dataset_file $split_scps || exit 1;
utils/split_scp.pl $data_split_file $split_scps || exit 1;

for n in $(seq $nj); do
mkdir -p $data/$n
done

$cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \
local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 $log_dir/lines.JOB.scp $data/JOB \
local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 \
$log_dir/lines.JOB.scp $data/JOB $writing_condition1 $writing_condition2 $writing_condition3 \
|| exit 1;

## concatenate the .scp files together.
Expand Down
20 changes: 16 additions & 4 deletions egs/madcat_ar/v1/local/prepare_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ stage=0
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
data_splits=data/download/data_splits
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
data_splits_dir=data/download/data_splits
images_scp_dir=data/local

. ./cmd.sh
. ./path.sh
Expand All @@ -29,9 +33,17 @@ mkdir -p data/{train,test,dev}
if [ $stage -le 1 ]; then
echo "$0: Processing dev, train and test data..."
echo "Date: $(date)."
local/process_data.py $download_dir1 $download_dir2 $download_dir3 $data_splits/madcat.dev.raw.lineid data/dev data/local/dev/images.scp || exit 1
local/process_data.py $download_dir1 $download_dir2 $download_dir3 $data_splits/madcat.test.raw.lineid data/test data/local/test/images.scp || exit 1
local/process_data.py $download_dir1 $download_dir2 $download_dir3 $data_splits/madcat.train.raw.lineid data/train data/local/train/images.scp || exit 1
local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \
$writing_condition1 $writing_condition2 $writing_condition3 || exit 1

local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.test.raw.lineid data/test $images_scp_dir/test/images.scp \
$writing_condition1 $writing_condition2 $writing_condition3 || exit 1

local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \
$writing_condition1 $writing_condition2 $writing_condition3 || exit 1

for dataset in dev test train; do
echo "$0: Fixing data directory for dataset: $dataset"
Expand Down
29 changes: 9 additions & 20 deletions egs/madcat_ar/v1/local/process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
help='directory location to write output files.')
parser.add_argument('images_scp_path', type=str,
help='Path of input images.scp file(maps line image and location)')
parser.add_argument('writing_condition1', type=str,
help='Path to the downloaded (and extracted) writing conditions file 1')
parser.add_argument('writing_condition2', type=str,
help='Path to the downloaded (and extracted) writing conditions file 2')
parser.add_argument('writing_condition3', type=str,
help='Path to the downloaded (and extracted) writing conditions file 3')
args = parser.parse_args()


Expand Down Expand Up @@ -164,29 +170,12 @@ def get_line_image_location():
image_file = os.path.join(args.out_dir, 'images.scp')
image_fh = open(image_file, 'w', encoding='utf-8')

data_path1 = args.database_path1
data_path2 = args.database_path2
data_path3 = args.database_path3

input_image_file = args.images_scp_path
input_image_fh = open(input_image_file, 'r', encoding='utf-8')

writing_condition_folder_list = args.database_path1.split('/')
writing_condition_folder1 = ('/').join(writing_condition_folder_list[:5])

writing_condition_folder_list = args.database_path2.split('/')
writing_condition_folder2 = ('/').join(writing_condition_folder_list[:5])

writing_condition_folder_list = args.database_path3.split('/')
writing_condition_folder3 = ('/').join(writing_condition_folder_list[:5])

writing_conditions1 = os.path.join(writing_condition_folder1, 'docs', 'writing_conditions.tab')
writing_conditions2 = os.path.join(writing_condition_folder2, 'docs', 'writing_conditions.tab')
writing_conditions3 = os.path.join(writing_condition_folder3, 'docs', 'writing_conditions.tab')

wc_dict1 = parse_writing_conditions(writing_conditions1)
wc_dict2 = parse_writing_conditions(writing_conditions2)
wc_dict3 = parse_writing_conditions(writing_conditions3)
wc_dict1 = parse_writing_conditions(args.writing_condition1)
wc_dict2 = parse_writing_conditions(args.writing_condition2)
wc_dict3 = parse_writing_conditions(args.writing_condition3)
image_loc_dict = get_line_image_location()

image_num = 0
Expand Down
18 changes: 9 additions & 9 deletions egs/madcat_ar/v1/local/wer_output_filter
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@ while (<>) {
$s =~ s/\x{0649}/\x{064A}/g;
$s =~ s/\x{0629}/\x{0647}/g;
$s =~ s/\x{0660}/0/g;
$s =~ s/\x{0660}/1/g;
$s =~ s/\x{0660}/2/g;
$s =~ s/\x{0660}/3/g;
$s =~ s/\x{0660}/4/g;
$s =~ s/\x{0660}/5/g;
$s =~ s/\x{0660}/6/g;
$s =~ s/\x{0660}/7/g;
$s =~ s/\x{0660}/8/g;
$s =~ s/\x{0660}/9/g;
$s =~ s/\x{0661}/1/g;
$s =~ s/\x{0662}/2/g;
$s =~ s/\x{0663}/3/g;
$s =~ s/\x{0664}/4/g;
$s =~ s/\x{0665}/5/g;
$s =~ s/\x{0666}/6/g;
$s =~ s/\x{0667}/7/g;
$s =~ s/\x{0668}/8/g;
$s =~ s/\x{0669}/9/g;
$s =~ s/\x{0621}//g;
$s =~ s/[\x{064b}-\x{0655}]//g;
$s =~ s/\x{0640}//g;
Expand Down
21 changes: 15 additions & 6 deletions egs/madcat_ar/v1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ decode_gmm=false
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
data_splits_dir=data/download/data_splits

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
Expand All @@ -33,21 +36,27 @@ mkdir -p data/local/{train,test,dev}
if [ $stage -le 0 ]; then
echo "$0: Downloading data splits..."
echo "Date: $(date)."
local/download_data.sh --data_splits $data_splits_dir
local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3
fi

if [ $stage -le 1 ]; then
for dataset in test train dev; do
dataset_file=$data_splits_dir/madcat.$dataset.raw.lineid
local/extract_lines.sh --nj $nj --cmd $cmd --dataset_file $dataset_file \
--download_dir1 $download_dir1 --download_dir2 $download_dir2 \
--download_dir3 $download_dir3 data/local/$dataset
data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid
local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \
--download_dir1 $download_dir1 --download_dir2 $download_dir2 \
--download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \
--writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \
--data data/local/$dataset
done
fi

if [ $stage -le 2 ]; then
echo "$0: Preparing data..."
local/prepare_data.sh
local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \
--download_dir3 $download_dir3 --images_scp_dir data/local \
--data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \
--writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3
fi

mkdir -p data/{train,test,dev}/data
Expand Down
28 changes: 17 additions & 11 deletions egs/madcat_ar/v1/run_end2end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ nj=70
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
data_splits_dir=data/download/data_splits

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
Expand All @@ -28,24 +31,27 @@ mkdir -p data/local/{train,test,dev}
if [ $stage -le 0 ]; then
echo "$0: Downloading data splits..."
echo "Date: $(date)."
local/download_data.sh --data_splits $data_splits_dir
local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3
fi

if [ $stage -le 1 ]; then
for dataset in test dev train; do
echo "$0: Extracting line images from page image for dataset: $dataset. "
echo "Date: $(date)."
dataset_file=$data_splits_dir/madcat.$dataset.raw.lineid
local/extract_lines.sh --nj $nj --cmd $cmd --dataset_file $dataset_file \
--download_dir1 $download_dir1 --download_dir2 $download_dir2 \
--download_dir3 $download_dir3 data/local/$dataset
for dataset in test train dev; do
data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid
local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \
--download_dir1 $download_dir1 --download_dir2 $download_dir2 \
--download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \
--writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \
--data data/local/$dataset
done
fi

if [ $stage -le 2 ]; then
echo "$0: Preparing dev train and eval data..."
echo "Date: $(date)."
local/prepare_data.sh
echo "$0: Preparing data..."
local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \
--download_dir3 $download_dir3 --images_scp_dir data/local \
--data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \
--writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3
fi

if [ $stage -le 3 ]; then
Expand Down

0 comments on commit 1ab3df0

Please sign in to comment.