CSTR-Edinburgh · b-schnell · Jul 4, 2017 · Jul 4, 2017 · Jul 14, 2017 · Jul 14, 2017
diff --git a/egs/build_your_own_voice/s1/README.md b/egs/build_your_own_voice/s1/README.md
@@ -1,48 +1,63 @@
-Download Merlin
----------------
+# Build your own voice
 
-git clone https://github.com/CSTR-Edinburgh/merlin.git
+To build your own voice, `cd egs/build_your_own_voice/s1` and follow the steps below:
 
-Setup
------
+## Setting up
 
-To setup voice: 
+The first step is to run setup as it creates directories and some text files for testing.
 
-./01_setup.sh give_a_voice_name
+The following command shows how to set up a voice.
 
-Prepare Data
-------------
+```sh
+./01_setup.sh my_voice
+```
 
-To derive labels, use alignment scripts provided below: <br/>
-a) state_align - https://github.com/CSTR-Edinburgh/merlin/tree/master/misc/scripts/alignment/state_align <br/>
-b) phone_align - https://github.com/CSTR-Edinburgh/merlin/tree/master/misc/scripts/alignment/phone_align
+It also creates a global config file: `conf/global_settings.cfg`, where default settings are stored.
+You need to modify these params as per your own data.
 
-Then, chose the vocoder: <br/>
-a) STRAIGHT - extracts 60-dim MGC, 25-dim BAP, 1-dim LF0 <br/>
-b) WORLD    - extracts 60-dim MGC, variable-dim BAP, 1-dim LF0 <br/>
-            - BAP dim (1 for 16Khz, 5 for 48Khz)  <br/>
-c) WORLD_v2 - extracts 60-dim MGC, 5-dim BAP, 1-dim LF0 <br/>
+## Prepare labels
 
-To derive acousitc features, use vocoder scripts provided below: <br/>
-a) STRAIGHT - https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/straight/extract_features_for_merlin.sh <br/>
-b) WORLD    - https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world/extract_features_for_merlin.sh <br/>
-c) WORLD_v2 - https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world_v2/extract_features_for_merlin.sh <br/>
+To prepare labels
+```sh
+./02_prepare_labels.sh <path_to_wav_dir> <path_to_text_dir> <path_to_labels_dir>
+```
 
-Run below script for instructions:
-./02_prepare_data.sh
+## Prepare acoustic features
+
+To prepare acoustic features
+```sh
+./03_prepare_acoustic_features.sh <path_to_wav_dir> <path_to_feat_dir>
+```
 
-Run Merlin
-----------
+## Prepare config files
 
-Once after setup, use below script to create acoustic, duration models and perform final test synthesis:
+At this point, we have to prepare two config files to train DNN models
+- Acoustic Model
+- Duration Model
 
-./03_run_merlin.sh
+To prepare config files:
+```sh
+./04_prepare_conf_files.sh conf/global_settings.cfg
+```
+Four config files will be generated: two for training, and two for testing. 
 
+## Train duration model
 
-Generate new sentences
-----------------------
+To train duration model:
+```sh
+./05_train_duration_model.sh <path_to_duration_conf_file>
+```
 
-To generate new sentences, please follow [steps] (https://github.com/CSTR-Edinburgh/merlin/issues/28) in below script:
+## Train acoustic model
 
-./04_merlin_synthesis.sh
+To train acoustic model:
+```sh
+./06_train_acoustic_model.sh <path_to_acoustic_conf_file>
+```
+## Synthesize speech
+
+To synthesize speech:
+```sh
+./07_run_merlin.sh <path_to_text_dir> <path_to_test_dur_conf_file> <path_to_test_synth_conf_file>
+```
 
diff --git a/egs/roger_blizzard2008/README b/egs/roger_blizzard2008/README
@@ -0,0 +1,12 @@
+About the roger blizzard2008 corpus
+
+The roger database was constructed at the Centre for Speech Technology Research at the University of Edinburgh for the Blizzard Challenge 2008. The database was only available to registered participants in the challenge (see http://www.cstr.ed.ac.uk/projects/roger_blizzard2008/).
+
+It contains 9609 utterances (~15h) from one English speaker (roger) at 16 kHz.
+
+Each subdirectory of this directory contains the
+scripts for a sequence of experiments.
+
+  s1: To run roger_demo or roger_full with WORLD vocoder
+
+
diff --git a/egs/roger_blizzard2008/s1/01_setup.sh b/egs/roger_blizzard2008/s1/01_setup.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+#
+# Set up the directory structure, training data and global configuration
+# for a voice built from the roger (Blizzard Challenge 2008) corpus.
+#
+# Usage:    ./01_setup.sh <voice_name>
+# Requires: ROGER_DB must name the directory of the roger database
+#           (its stp/ and wav/ sub-directories and utts.data are read).
+
+# Exactly one argument (the voice name) is expected.
+if [ "$#" -ne 1 ]; then
+    echo "################################"
+    echo "Usage:"
+    echo "./01_setup.sh <voice_name>"
+    echo ""
+    echo "Give a voice name: roger_demo or roger_full"
+    echo "   Demo uses theherald1 (281 utterances, 42.8 minutes)"
+    echo "   Full uses carroll, arctic and theherald1-3 (4871 utterances, ~8h)"
+    echo "################################"
+    exit 1
+fi
+
+# The corpus location comes from the environment; stop early if it is unset
+# or does not name an existing directory.
+if [ ! -d "${ROGER_DB}" ]; then
+    echo "ERROR: Variable ROGER_DB must be set to the roger database."
+    echo "       Use: export ROGER_DB=path/to/db/"
+    exit 1
+fi
+
+### Step 1: setup directories and the training data files ###
+echo "Step 1:"
+
+# All paths are derived from the directory the script is started in. The
+# Merlin root is taken to be three levels above it (<merlin>/egs/<corpus>/s1).
+# Expansions are quoted so a checkout path containing whitespace still works.
+current_working_dir=$(pwd)
+merlin_dir=$(dirname "$(dirname "$(dirname "${current_working_dir}")")")
+experiments_dir="${current_working_dir}/experiments"
+data_dir="${current_working_dir}/database"
+
+voice_name=$1
+voice_dir="${experiments_dir}/${voice_name}"
+
+acoustic_dir="${voice_dir}/acoustic_model"
+duration_dir="${voice_dir}/duration_model"
+synthesis_dir="${voice_dir}/test_synthesis"
+
+# mkdir -p creates the intermediate experiments/voice/model directories too,
+# so only the leaf directories have to be listed.
+mkdir -p "${data_dir}" \
+    "${acoustic_dir}/data" \
+    "${duration_dir}/data" \
+    "${synthesis_dir}/txt"
+
+
+# Locations of the corpus-derived data, relative to the working directory.
+audio_dir=database/wav
+txt_dir=database/txt
+label_dir=database/labels
+
+# Select the utterance list(s) to be used for training.
+# A list can be any of carroll, unilex, address, spelling, arctic, emphasis,
+# theherald, theherald1, theherald2, theherald3, all_new, total.
+if [[ "$voice_name" == *"demo"* ]]; then
+    # The demo version only uses theherald1 (281 utterances, 42.8 minutes)
+    uttLists=("theherald1")
+elif [[ "$voice_name" == *"full"* ]]; then
+    # The full version uses all utterance lists with meaningful utterances.
+    # Using: carroll, arctic, theherald1-3 (4871 utterances, ~8h).
+    uttLists=("carroll" "arctic" "theherald")
+else
+    echo "Undefined voice name ($voice_name)...please use roger_demo or roger_full !!"
+    exit 1
+fi
+
+# Collect utterance ids of necessary audio files.
+utts=()
+for uttList in "${uttLists[@]}"; do
+    utt_list_file="${ROGER_DB}/stp/${uttList}"
+    # Abort instead of silently continuing with an incomplete selection.
+    if [ ! -r "${utt_list_file}" ]; then
+        echo "ERROR: Utterance list ${utt_list_file} is missing or not readable."
+        exit 1
+    fi
+    # -t removes the trailing newline, -O appends after the entries read so far.
+    mapfile -t -O "${#utts[@]}" utts < "${utt_list_file}"
+done
+# An empty selection would later turn into an empty grep pattern, which
+# matches every line of utts.data, so it is rejected here.
+if [ "${#utts[@]}" -eq 0 ]; then
+    echo "ERROR: No utterance ids found in the selected lists: ${uttLists[*]}"
+    exit 1
+fi
+# Remove duplicates (the ids are re-split on whitespace, which also drops blank lines).
+utts=($(printf "%s\n" "${utts[@]}" | sort -u))
+
+# Resolve the corpus location to an absolute path: a symbolic link with a
+# relative target is interpreted relative to the link's own directory, so a
+# relative ROGER_DB would leave dangling links in $audio_dir.
+roger_db_abs=$(cd "${ROGER_DB}" > /dev/null && pwd)
+if [ -z "${roger_db_abs}" ]; then
+    echo "ERROR: Cannot resolve ROGER_DB (${ROGER_DB}) to an absolute path."
+    exit 1
+fi
+
+# Audios have to be removed because utterance list selection could have been changed.
+rm -rf "${audio_dir}"
+# Leave this check for fast testing, when $audio_dir does not have to be removed.
+if [ ! -e "${audio_dir}" ]; then
+    mkdir -p "${audio_dir}"
+    # Link (rather than copy) the necessary audio files. The first seven
+    # characters of an utterance id select the sub-directory of wav/.
+    for utt in "${utts[@]}"; do
+        ln -sf "${roger_db_abs}/wav/${utt:0:7}/${utt}.wav" "${audio_dir}/${utt}.wav"
+    done
+fi
+
+# Labels have to be removed because utterance list selection could have been changed.
+rm -rf "${txt_dir}"
+# Leave this check for fast testing, when $txt_dir does not have to be removed.
+if [ ! -e "${txt_dir}" ]; then
+    mkdir -p "${txt_dir}"
+    # Combine the selected utterances to a regex pattern (id1|id2|...).
+    utts_pat=$(IFS='|'; printf '%s' "${utts[*]}")
+    # utts.data holds one line per utterance. Keep the lines that belong to
+    # the selected utterances and turn each into a txt file: the file name is
+    # the first field without its leading character, the content is the rest
+    # of the line. Each file is closed right after writing so that awk
+    # implementations with a limited number of open files do not fail on
+    # large selections; the directory is fresh, so appending equals creating.
+    grep -wE "${utts_pat}" "${roger_db_abs}/utts.data" |
+        awk -F' ' -v outDir="${txt_dir}" '{
+            out = outDir "/" substr($1, 2, length($1) - 1) ".txt"
+            print substr($0, length($1) + 2, length($0)) >> out
+            close(out)
+        }'
+fi
+
+# Clear the labels directory.
+rm -rf "${label_dir}"
+
+### create some test files ###
+# One sentence per file; the file name without .txt is the id listed below.
+test_txt_dir=${synthesis_dir}/txt
+printf '%s\n' "Hello world." > ${test_txt_dir}/test_001.txt
+printf '%s\n' "Hi, this is a demo voice from Merlin." > ${test_txt_dir}/test_002.txt
+printf '%s\n' "Hope you guys enjoy free open-source voices from Merlin." > ${test_txt_dir}/test_003.txt
+# The id list is written without a newline after the last entry.
+printf '%s\n%s\n%s' test_001 test_002 test_003 > ${synthesis_dir}/test_id_list.scp
+
+global_config_file=conf/global_settings.cfg
+
+# The config directory is not created anywhere else in this script; without
+# it the write below would fail while the script still reported success.
+mkdir -p "$(dirname "${global_config_file}")"
+
+# Automatically select 5% of the data for validation and test set each;
+# the remaining files form the training set.
+num_files=$(ls -1 "${audio_dir}" | wc -l)
+num_dev_set=$(awk "BEGIN { pc=${num_files}*0.05; print(int(pc)) }")
+num_train_set=$((num_files - 2 * num_dev_set))
+
+### default settings ###
+# The file is written in one go. The later steps read it with `source`, so
+# every line has to stay a valid shell assignment, comment or blank line.
+cat > "${global_config_file}" <<EOF
+######################################
+############# PATHS ##################
+######################################
+
+MerlinDir=${merlin_dir}
+WorkDir=${current_working_dir}
+
+######################################
+############# PARAMS #################
+######################################
+
+Voice=${voice_name}
+Labels=state_align
+QuestionFile=questions-radio_dnn_416.hed
+Vocoder=WORLD
+SamplingFreq=16000
+SilencePhone='sil'
+FileIDList=file_id_list.scp
+
+######################################
+######### No. of files ###############
+######################################
+
+Train=${num_train_set}
+Valid=${num_dev_set}
+Test=${num_dev_set}
+
+######################################
+############# TOOLS ##################
+######################################
+
+ESTDIR=${merlin_dir}/tools/speech_tools
+FESTDIR=${merlin_dir}/tools/festival
+FESTVOXDIR=${merlin_dir}/tools/festvox
+
+HTKDIR=${merlin_dir}/tools/bin/htk
+
+EOF
+
+echo "Merlin default voice settings configured in \"$global_config_file\""
+echo "Modify these params as per your data..."
+echo "eg., sampling frequency, no. of train files etc.,"
+echo "setup done...!"
+
diff --git a/egs/roger_blizzard2008/s1/02_prepare_labels.sh b/egs/roger_blizzard2008/s1/02_prepare_labels.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# Step 2: create alignment labels for the training data and copy them, with
+# a list of file ids, into the duration and acoustic model data directories.
+#
+# Usage: ./02_prepare_labels.sh <path_to_wav_dir> <path_to_text_dir> <path_to_labels_dir>
+# Reads Labels, Voice and FileIDList from conf/global_settings.cfg.
+
+if [ "$#" -ne 3 ]; then
+    echo "################################"
+    echo "Usage:"
+    echo "./02_prepare_labels.sh <path_to_wav_dir> <path_to_text_dir> <path_to_labels_dir>"
+    echo ""
+    echo "default path to wav dir(Input): database/wav"
+    echo "default path to txt dir(Input): database/txt"
+    echo "default path to lab dir(Output): database/labels"
+    echo "################################"
+    exit 1
+fi
+
+global_config_file=conf/global_settings.cfg
+# Without the settings none of the paths below can be formed correctly.
+if [ ! -f "${global_config_file}" ]; then
+    echo "ERROR: ${global_config_file} not found. Please run ./01_setup.sh first."
+    exit 1
+fi
+source "${global_config_file}"
+
+wav_dir=$1
+inp_txt=$2
+lab_dir=$3
+
+####################################
+########## Prepare labels ##########
+####################################
+
+# Switches to re-run only one of the two stages while testing.
+prepare_labels=true
+copy=true
+
+if [ "$prepare_labels" = true ]; then
+    echo "Step 2: "
+    echo "Preparing labels..."
+
+    case "$Labels" in
+        state_align)
+            ./scripts/run_state_aligner.sh "$wav_dir" "$inp_txt" "$lab_dir" "$global_config_file"
+            ;;
+        phone_align)
+            ./scripts/run_phone_aligner.sh "$wav_dir" "$inp_txt" "$lab_dir" "$global_config_file"
+            ;;
+        *)
+            echo "These labels ($Labels) are not supported as of now...please use state_align or phone_align!!"
+            # Stop here: no labels were produced, so there is nothing to copy.
+            exit 1
+            ;;
+    esac
+fi
+
+if [ "$copy" = true ]; then
+    label_src_dir="${lab_dir}/label_${Labels}"
+    if [ ! -d "${label_src_dir}" ]; then
+        echo "ERROR: Label directory ${label_src_dir} does not exist, nothing to copy."
+        exit 1
+    fi
+
+    echo "Copying labels to duration and acoustic data directories..."
+
+    duration_data_dir=experiments/${Voice}/duration_model/data
+    acoustic_data_dir=experiments/${Voice}/acoustic_model/data
+
+    for model_data_dir in "$duration_data_dir" "$acoustic_data_dir"; do
+        cp -r "${label_src_dir}" "${model_data_dir}"
+        # The file id list is the directory listing with ".lab" removed.
+        # Piping through sed avoids "sed -i", whose syntax differs between
+        # GNU and BSD sed.
+        ls "${label_src_dir}" | sed 's/\.lab//g' > "${model_data_dir}/${FileIDList}"
+    done
+
+    echo "done...!"
+fi
diff --git a/egs/roger_blizzard2008/s1/03_prepare_acoustic_features.sh b/egs/roger_blizzard2008/s1/03_prepare_acoustic_features.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# Step 3: extract acoustic features from the wav files with the WORLD
+# feature extraction script shipped with Merlin and copy them into the
+# acoustic model data directory.
+#
+# Usage: ./03_prepare_acoustic_features.sh <path_to_wav_dir> <path_to_feat_dir>
+# Reads MerlinDir, SamplingFreq and Voice from conf/global_settings.cfg.
+
+if [ "$#" -ne 2 ]; then
+    echo "################################"
+    echo "Usage:"
+    echo "./03_prepare_acoustic_features.sh <path_to_wav_dir> <path_to_feat_dir>"
+    echo ""
+    echo "default path to wav dir(Input): database/wav"
+    echo "default path to feat dir(Output): database/feats"
+    echo "################################"
+    exit 1
+fi
+
+global_config_file=conf/global_settings.cfg
+# Without the settings the extraction script cannot be located.
+if [ ! -f "${global_config_file}" ]; then
+    echo "ERROR: ${global_config_file} not found. Please run ./01_setup.sh first."
+    exit 1
+fi
+source "${global_config_file}"
+
+wav_dir=$1
+feat_dir=$2
+
+# An empty (or missing) wav directory leaves nothing to extract from.
+if [ ! "$(ls -A "${wav_dir}")" ]; then
+    echo "Please place your audio files in: ${wav_dir}"
+    exit 1
+fi
+
+####################################
+##### prepare vocoder features #####
+####################################
+
+# Switches to re-run only one of the two stages while testing.
+prepare_feats=true
+copy=true
+
+if [ "$prepare_feats" = true ]; then
+    echo "Step 3:"
+    echo "Prepare acoustic features using WORLD vocoder..."
+    # Abort on failure so that no partial feature set is copied and reported as done.
+    if ! python "${MerlinDir}/misc/scripts/vocoder/world/extract_features_for_merlin.py" \
+            "${MerlinDir}" "${wav_dir}" "${feat_dir}" "${SamplingFreq}"; then
+        echo "ERROR: Acoustic feature extraction failed."
+        exit 1
+    fi
+fi
+
+if [ "$copy" = true ]; then
+    echo "Copying features to acoustic data directory..."
+    acoustic_data_dir=experiments/${Voice}/acoustic_model/data
+    cp -r "${feat_dir}"/* "${acoustic_data_dir}"
+    echo "done...!"
+fi