Skip to content

Commit

Permalink
ref
Browse files Browse the repository at this point in the history
  • Loading branch information
astorfi committed Jul 18, 2018
1 parent ae9ceec commit 4c2d901
Show file tree
Hide file tree
Showing 6 changed files with 508 additions and 28 deletions.
198 changes: 184 additions & 14 deletions paper/paper.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,90 @@
@inproceedings{hirsch2000aurora,
  title     = {The {Aurora} experimental framework for the performance evaluation of speech recognition systems under noisy conditions},
  author    = {Hirsch, Hans-G{\"u}nter and Pearce, David},
  booktitle = {{ASR2000}-Automatic Speech Recognition: Challenges for the new Millenium {ISCA} Tutorial and Research Workshop ({ITRW})},
  year      = {2000},
}

@book{guyon2008feature,
  title     = {Feature extraction: foundations and applications},
  author    = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi A},
  volume    = {207},
  year      = {2008},
  publisher = {Springer},
}

@article{furui1986speaker,
  title     = {Speaker-independent isolated word recognition using dynamic features of speech spectrum},
  author    = {Furui, Sadaoki},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {34},
  number    = {1},
  pages     = {52--59},
  year      = {1986},
  publisher = {IEEE},
}

@book{yu2016automatic,
  title     = {Automatic Speech Recognition},
  author    = {Yu, Dong and Deng, Li},
  year      = {2016},
  publisher = {Springer},
}

@book{rabiner1993fundamentals,
  title     = {Fundamentals of speech recognition},
  author    = {Rabiner, Lawrence R and Juang, Biing-Hwang},
  volume    = {14},
  year      = {1993},
  publisher = {PTR Prentice Hall},
  address   = {Englewood Cliffs, NJ},
}

@article{campbell1997speaker,
  title     = {Speaker recognition: A tutorial},
  author    = {Campbell, Joseph P},
  journal   = {Proceedings of the IEEE},
  volume    = {85},
  number    = {9},
  pages     = {1437--1462},
  year      = {1997},
  publisher = {IEEE},
}


@inproceedings{deng2013recent,
  title        = {Recent advances in deep learning for speech research at {Microsoft}},
  author       = {Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, Frank and Seltzer, Michael and Zweig, Geoff and He, Xiaodong and Williams, Jason and others},
  booktitle    = {Acoustics, Speech and Signal Processing ({ICASSP}), 2013 IEEE International Conference on},
  pages        = {8604--8608},
  year         = {2013},
  organization = {IEEE},
}

@inproceedings{lee2009unsupervised,
  title     = {Unsupervised feature learning for audio classification using convolutional deep belief networks},
  author    = {Lee, Honglak and Pham, Peter and Largman, Yan and Ng, Andrew Y},
  booktitle = {Advances in neural information processing systems},
  pages     = {1096--1104},
  year      = {2009},
}

@inproceedings{yu2011improved,
  title     = {Improved bottleneck features using pretrained deep neural networks},
  author    = {Yu, Dong and Seltzer, Michael L},
  booktitle = {Twelfth Annual Conference of the International Speech Communication Association},
  year      = {2011},
}

@article{giannakopoulos2015pyaudioanalysis,
  title     = {{pyAudioAnalysis}: An Open-Source {Python} Library for Audio Signal Analysis},
  author    = {Giannakopoulos, Theodoros},
  journal   = {{PLoS} {ONE}},
  volume    = {10},
  number    = {12},
  year      = {2015},
  publisher = {Public Library of Science},
}

@article{torfi2017text,
title={Text-independent speaker verification using 3d convolutional neural networks},
author={Torfi, Amirsina and Nasrabadi, Nasser M and Dawson, Jeremy},
Expand All @@ -15,23 +102,106 @@ @article{torfi20173d
publisher={IEEE}
}

@article{torfi2018speechpy,
title={Speechpy-a library for speech processing and recognition},
author={Torfi, Amirsina},
journal={arXiv preprint arXiv:1803.01094},
year={2018}
@article{prechelt2000empirical,
  title   = {An empirical comparison of {C}, {C++}, {Java}, {Perl}, {Python}, {Rexx} and {Tcl}},
  author  = {Prechelt, Lutz},
  journal = {IEEE Computer},
  volume  = {33},
  number  = {10},
  pages   = {23--29},
  year    = {2000},
}

@article{mobiny2018text,
title={Text-Independent Speaker Verification Using Long Short-Term Memory Networks},
author={Mobiny, Aryan},
journal={arXiv preprint arXiv:1805.00604},
year={2018}
@misc{torfispeechpy,
  author = {Torfi, Amirsina},
  title  = {{SpeechPy}: Speech recognition and feature extraction},
  month  = aug,
  year   = {2017},
  doi    = {10.5281/zenodo.810391},
  url    = {https://doi.org/10.5281/zenodo.810391},
}

@article{torfi2017coupled,
  title   = {Coupled {3D} Convolutional Neural Networks for Audio-Visual Recognition},
  author  = {Torfi, Amirsina and Iranmanesh, Seyed Mehdi and Nasrabadi, Nasser M and Dawson, Jeremy},
  journal = {arXiv preprint arXiv:1706.05739},
  year    = {2017},
}

@article{torfi2017construction,
  title   = {On the Construction of Polar Codes for Achieving the Capacity of Marginal Channels},
  author  = {Torfi, Amirsina and Soleymani, Sobhan and Vakili, Vahid Tabataba},
  journal = {arXiv preprint arXiv:1707.04512},
  year    = {2017},
}

@article{shannon2001mathematical,
  title     = {A mathematical theory of communication},
  author    = {Shannon, Claude Elwood},
  journal   = {ACM SIGMOBILE Mobile Computing and Communications Review},
  volume    = {5},
  number    = {1},
  pages     = {3--55},
  year      = {2001},
  publisher = {ACM},
}

@article{gurban2009information,
  title     = {Information theoretic feature extraction for audio-visual speech recognition},
  author    = {Gurban, Mihai and Thiran, Jean-Philippe},
  journal   = {IEEE Transactions on Signal Processing},
  volume    = {57},
  number    = {12},
  pages     = {4765--4776},
  year      = {2009},
  publisher = {IEEE},
}

@article{salehghaffari2018speaker,
title={Speaker Verification using Convolutional Neural Networks},
author={Salehghaffari, Hossein},
journal={arXiv preprint arXiv:1803.05427},
@inproceedings{variani2014deep,
  title        = {Deep neural networks for small footprint text-dependent speaker verification},
  author       = {Variani, Ehsan and Lei, Xin and McDermott, Erik and Moreno, Ignacio Lopez and Gonzalez-Dominguez, Javier},
  booktitle    = {Acoustics, Speech and Signal Processing (ICASSP), 2014 IEEE International Conference on},
  pages        = {4052--4056},
  year         = {2014},
  organization = {IEEE},
}

@article{hinton2012deep,
  title     = {Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups},
  author    = {Hinton, Geoffrey and Deng, Li and Yu, Dong and Dahl, George E and Mohamed, Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Sainath, Tara N and others},
  journal   = {IEEE Signal Processing Magazine},
  volume    = {29},
  number    = {6},
  pages     = {82--97},
  year      = {2012},
  publisher = {IEEE},
}

@article{lecun2015deep,
  title     = {Deep learning},
  author    = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  journal   = {{Nature}},
  volume    = {521},
  number    = {7553},
  pages     = {436--444},
  year      = {2015},
  publisher = {Nature Publishing Group},
}

@article{liu2015deep,
  title     = {Deep feature for text-dependent speaker verification},
  author    = {Liu, Yuan and Qian, Yanmin and Chen, Nanxin and Fu, Tianfan and Zhang, Ya and Yu, Kai},
  journal   = {Speech Communication},
  volume    = {73},
  pages     = {1--13},
  year      = {2015},
  publisher = {Elsevier},
}

@article{torfi2018attention,
  title   = {Attention-Based Guided Structured Sparsity of Deep Neural Networks},
  author  = {Torfi, Amirsina and Shirvani, Rouzbeh A},
  journal = {arXiv preprint arXiv:1802.09902},
  year    = {2018},
}


164 changes: 150 additions & 14 deletions paper/paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,162 @@ date: 15 May 2018
bibliography: paper.bib
---

# Abstract
SpeechPy is an open source Python package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification.

# Summary
# Introduction
Automatic Speech Recognition (ASR) requires three main components for
further analysis: Preprocessing, feature extraction, and
post-processing. Feature extraction, in an abstract meaning, is
extracting descriptive features from raw signal for speech
classification purposes. Due to the high
dimensionality, the raw signal can be less informative compared to
extracted higher level features. Feature extraction comes to our rescue
for turning the high dimensional signal to a lower dimensional and yet
more informative version of that for sound recognition and
classification [@furui1986speaker; @guyon2008feature; @hirsch2000aurora].

``SpeechPy`` is an open source package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification.
![Scheme of speech recognition](_imgs/Scheme_of_speech_recognition_system.png)

Considering the recent advent of deep learning in ASR and SR and the importance of the accurate speech feature extraction, here are the motivations behind ``SpeechPy`` package:
Feature extraction, in essence, should be done considering the specific
application at hand. For example, in ASR applications, the linguistic
characteristics of the raw signal are of great importance and the other
characteristics must be
ignored [@yu2016automatic; @rabiner1993fundamentals]. On the other hand,
in Speaker Recognition (SR) task, solely voice-associated information
must be contained in extracted feature [@campbell1997speaker]. So the
feature extraction goal is to extract the relevant feature from the raw
signal and map it to a lower dimensional feature space. The problem of
feature extraction has been investigated in pattern classification aimed
at preventing the curse of dimensionality. There are some feature
extraction approaches based on information theory
[@torfi2017construction; @shannon2001mathematical] applied to multimodal
signals and demonstrated promising results [@gurban2009information].

* Developing a free open source package which covers important preprocessing techniques,
speech features, and post-processing operations required for ASR and SR applications.
The speech features can be categorized into two general types of
acoustic and linguistic features. The former one is mainly related to
non-verbal sounds and the latter one is associated with ASR and SR
systems for which the verbal part has the major role. Perhaps one of the most
famous linguistic features, which is hard to beat, is the Mel-Frequency
Cepstral Coefficients (MFCC). It uses speech raw frames in the range
from 20ms to 40ms for having stationary
characteristics [@rabiner1993fundamentals]. MFCC is widely used for both
ASR and SR tasks and more recently in the associated deep learning
applications as the input to the network rather than directly feeding
the signal [@deng2013recent; @lee2009unsupervised; @yu2011improved].
With the advent of deep learning [@lecun2015deep; @torfi2018attention],
major improvements have been achieved by using deep neural networks
rather than traditional methods for speech recognition
applications [@variani2014deep; @hinton2012deep; @liu2015deep].

* A simple package with a minimum degree of complexity should be available for beginners.
Despite the availability of free software for speech recognition, such as
VOICEBOX[^1], most of these packages are Matlab-based, which limits
their reproducibility due to commercial issues. Another great package is
PyAudioAnalysis [@giannakopoulos2015pyaudioanalysis], which is a
comprehensive package developed in Python. However, the issue with
PyAudioAnalysis is its complexity and verbosity for
extracting simple features, and it also lacks some important
preprocessing and post-processing operations in its current version.

* A well-tested and continuously integrated package for future developments should be developed.
Considering the recent advent of deep learning in ASR and SR and the
importance of the accurate speech feature extraction, here are the
motivations behind SpeechPy package:

``SpeechPy`` has been developed to satisfy the aforementioned needs. It contains the most important
preprocessing and post-processing operations and a selection of frequently used speech features. The
package is free and released as open source software. Continuous integration, for instant
error checking and validation of changes, has been deployed for ``SpeechPy``. Moreover, prior to the latest
official release of ``SpeechPy``, the package has successfully been utilized for research purposes
* Developing a free open source package which covers important
preprocessing techniques, speech features, and post-processing
operations required for ASR and SR applications.

# Acknowledgements
* A simple package with a minimum degree of complexity should be
available for beginners.

This work is based upon a work supported by the Center for Identification Technology Research (CITeR) and the National Science Foundation (NSF) under Grant #1650474. This work was completed in part with resources provided by the Center for Identification Technology Research (CITeR) at the West Virginia University. I would like to thank Professor Nasser Nasrabadi at West Virginia University for supporting me during the development of this project and for his valuable advice.
* A well-tested and continuously integrated package for future
developments should be developed.

SpeechPy has been developed to satisfy the aforementioned needs. It
contains the most important preprocessing and post-processing operations
and a selection of frequently used speech features. The package is free
and released as open source software[^2]. Continuous integration,
for instant error checking and validation of changes, has been deployed
for SpeechPy. Moreover, prior to the latest official release of
SpeechPy, the package has successfully been utilized for research
purposes [@torfi20173d; @torfi2017text].

# Package Eco-system


SpeechPy has been developed using the Python language for both its interface and
backend. An empirical study demonstrated that Python, as a
scripting language, is more effective and productive than conventional
languages[^3] for some programming problems, and memory consumption is
often “better than Java and not much worse than C or
C++” [@prechelt2000empirical]. We chose Python due to its simplicity and
popularity. Third-party libraries are avoided except *Numpy* and *Scipy*
for handling data and numeric computations.

## Complexity

As the user should not and does not even need to manipulate the internal
package structure, object-oriented programming is mostly used for
package development which provides easier interface for the user with a
sacrifice to the simplicity of the code. However, the internal code
complexity of the package does not affect the user experience since the
modules can easily be called with the associated arguments. SpeechPy is
a library with a collection of sub-modules.

## Code Style and Documentation

SpeechPy is constructed based on PEP 8 style guide for Python codes.
Moreover, it is extensively documented using the formatted docstrings
and Sphinx[^4] for further automatic modifications to the document in
case of changing internal modules. The full documentation of the project
will be generated in HTML and PDF format using Sphinx and is hosted
online. The official releases of the project are hosted on the Zenodo as
well[^5] [@torfispeechpy].

![A general view of the package](_imgs/packageview.png)

## Continuous Testing and Extensibility

The output of each function has been evaluated as well using different
tests as opposed to the other existing standard packages. For continuous
testing, the code is hosted on GitHub and integrated with Travis CI.
Each modification to the code must pass the unit tests defined for the
continuous integration. This will ensure the package does not break with
unadapted code scripts. However, the validity of the modifications
should always be investigated with the owner or authorized collaborators
of the project. The code will be tested at each time of modification for
Python versions *“2.7”*, *“3.4”* and *“3.5”*. In the future, these
versions are subject to change.

![Travis CI web interface after testing SpeechPy against a new change](_imgs/travicCI.png)

# Availability

## Operating system {#operating-system .unnumbered}

Tested on Ubuntu 14.04 and 16.04 LTS Linux, Apple Mac OS X 10.9.5 , and
Microsoft Windows 7 & 10. We expect that SpeechPy works on any
distribution as long as Python and the package dependencies are
installed.

## Programming language {#programming-language .unnumbered}

The package has been tested with Python 2.7, 3.4 and 3.5. However, using
Python 3.5 is suggested.

## Additional system requirements & dependencies {#additional-system-requirements-dependencies .unnumbered}

SpeechPy is a lightweight package, and little computational power is
needed to run it, although the speed of execution is entirely
dependent on the system architecture. The dependencies are as follows:

* Numpy

* SciPy

# Acknowledgements

This work has been completed in part with computational resources
provided by the West Virginia University and is based upon a work
supported by the Center for Identification Technology Research (CITeR)
and the National Science Foundation (NSF) under Grant \#1650474.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/test/_imgs/packageview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/test/_imgs/travicCI.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 4c2d901

Please sign in to comment.