Skip to content

Commit

Permalink
ref
Browse files Browse the repository at this point in the history
  • Loading branch information
astorfi committed Jul 18, 2018
1 parent ae9ceec commit 4c2d901
Show file tree
Hide file tree
Showing 6 changed files with 508 additions and 28 deletions.
198 changes: 184 additions & 14 deletions paper/paper.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,90 @@
@inproceedings{hirsch2000aurora,
  title     = {The {Aurora} experimental framework for the performance evaluation of speech recognition systems under noisy conditions},
  author    = {Hirsch, Hans-G{\"u}nter and Pearce, David},
  booktitle = {{ASR2000}-Automatic Speech Recognition: Challenges for the new Millenium {ISCA} Tutorial and Research Workshop ({ITRW})},
  year      = {2000},
}

@book{guyon2008feature,
  title     = {Feature extraction: foundations and applications},
  author    = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi A},
  volume    = {207},
  year      = {2008},
  publisher = {Springer},
}

@article{furui1986speaker,
  title     = {Speaker-independent isolated word recognition using dynamic features of speech spectrum},
  author    = {Furui, Sadaoki},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {34},
  number    = {1},
  pages     = {52--59},
  year      = {1986},
  publisher = {IEEE},
}

@book{yu2016automatic,
  title     = {Automatic Speech Recognition},
  author    = {Yu, Dong and Deng, Li},
  year      = {2016},
  publisher = {Springer},
}

@book{rabiner1993fundamentals,
  title     = {Fundamentals of speech recognition},
  author    = {Rabiner, Lawrence R and Juang, Biing-Hwang},
  volume    = {14},
  year      = {1993},
  publisher = {PTR Prentice Hall},
  address   = {Englewood Cliffs, NJ},
}

@article{campbell1997speaker,
  title     = {Speaker recognition: A tutorial},
  author    = {Campbell, Joseph P},
  journal   = {Proceedings of the IEEE},
  volume    = {85},
  number    = {9},
  pages     = {1437--1462},
  year      = {1997},
  publisher = {IEEE},
}


@inproceedings{deng2013recent,
  title        = {Recent advances in deep learning for speech research at {Microsoft}},
  author       = {Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, Frank and Seltzer, Michael and Zweig, Geoff and He, Xiaodong and Williams, Jason and others},
  booktitle    = {Acoustics, Speech and Signal Processing ({ICASSP}), 2013 IEEE International Conference on},
  pages        = {8604--8608},
  year         = {2013},
  organization = {IEEE},
}

@inproceedings{lee2009unsupervised,
  title     = {Unsupervised feature learning for audio classification using convolutional deep belief networks},
  author    = {Lee, Honglak and Pham, Peter and Largman, Yan and Ng, Andrew Y},
  booktitle = {Advances in neural information processing systems},
  pages     = {1096--1104},
  year      = {2009},
}

@inproceedings{yu2011improved,
  title     = {Improved bottleneck features using pretrained deep neural networks},
  author    = {Yu, Dong and Seltzer, Michael L},
  booktitle = {Twelfth Annual Conference of the International Speech Communication Association},
  year      = {2011},
}

@article{giannakopoulos2015pyaudioanalysis,
  title     = {{pyAudioAnalysis}: An Open-Source {Python} Library for Audio Signal Analysis},
  author    = {Giannakopoulos, Theodoros},
  journal   = {{PLoS} {ONE}},
  volume    = {10},
  number    = {12},
  year      = {2015},
  publisher = {Public Library of Science},
}

@article{torfi2017text,
title={Text-independent speaker verification using 3d convolutional neural networks},
author={Torfi, Amirsina and Nasrabadi, Nasser M and Dawson, Jeremy},
Expand All @@ -15,23 +102,106 @@ @article{torfi20173d
publisher={IEEE}
}

@article{torfi2018speechpy,
title={Speechpy-a library for speech processing and recognition},
author={Torfi, Amirsina},
journal={arXiv preprint arXiv:1803.01094},
year={2018}
@article{prechelt2000empirical,
  title   = {An empirical comparison of {C}, {C++}, {Java}, {Perl}, {Python}, {Rexx} and {Tcl}},
  author  = {Prechelt, Lutz},
  journal = {IEEE Computer},
  volume  = {33},
  number  = {10},
  pages   = {23--29},
  year    = {2000},
}

@article{mobiny2018text,
title={Text-Independent Speaker Verification Using Long Short-Term Memory Networks},
author={Mobiny, Aryan},
journal={arXiv preprint arXiv:1805.00604},
year={2018}
@misc{torfispeechpy,
  author = {Torfi, Amirsina},
  title  = {{SpeechPy}: Speech recognition and feature extraction},
  month  = aug,
  year   = {2017},
  doi    = {10.5281/zenodo.810391},
  url    = {https://doi.org/10.5281/zenodo.810391},
}

@article{torfi2017coupled,
  title   = {Coupled {3D} Convolutional Neural Networks for Audio-Visual Recognition},
  author  = {Torfi, Amirsina and Iranmanesh, Seyed Mehdi and Nasrabadi, Nasser M and Dawson, Jeremy},
  journal = {arXiv preprint arXiv:1706.05739},
  year    = {2017},
}

@article{torfi2017construction,
  title   = {On the Construction of Polar Codes for Achieving the Capacity of Marginal Channels},
  author  = {Torfi, Amirsina and Soleymani, Sobhan and Vakili, Vahid Tabataba},
  journal = {arXiv preprint arXiv:1707.04512},
  year    = {2017},
}

@article{shannon2001mathematical,
  title     = {A mathematical theory of communication},
  author    = {Shannon, Claude Elwood},
  journal   = {ACM SIGMOBILE Mobile Computing and Communications Review},
  volume    = {5},
  number    = {1},
  pages     = {3--55},
  year      = {2001},
  publisher = {ACM},
}

@article{gurban2009information,
  title     = {Information theoretic feature extraction for audio-visual speech recognition},
  author    = {Gurban, Mihai and Thiran, Jean-Philippe},
  journal   = {IEEE Transactions on Signal Processing},
  volume    = {57},
  number    = {12},
  pages     = {4765--4776},
  year      = {2009},
  publisher = {IEEE},
}

@article{salehghaffari2018speaker,
title={Speaker Verification using Convolutional Neural Networks},
author={Salehghaffari, Hossein},
journal={arXiv preprint arXiv:1803.05427},
@inproceedings{variani2014deep,
  title        = {Deep neural networks for small footprint text-dependent speaker verification},
  author       = {Variani, Ehsan and Lei, Xin and McDermott, Erik and Moreno, Ignacio Lopez and Gonzalez-Dominguez, Javier},
  booktitle    = {Acoustics, Speech and Signal Processing (ICASSP), 2014 IEEE International Conference on},
  pages        = {4052--4056},
  year         = {2014},
  organization = {IEEE},
}

@article{hinton2012deep,
  title     = {Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups},
  author    = {Hinton, Geoffrey and Deng, Li and Yu, Dong and Dahl, George E and Mohamed, Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Sainath, Tara N and others},
  journal   = {IEEE Signal Processing Magazine},
  volume    = {29},
  number    = {6},
  pages     = {82--97},
  year      = {2012},
  publisher = {IEEE},
}

@article{lecun2015deep,
  title     = {Deep learning},
  author    = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  journal   = {{Nature}},
  volume    = {521},
  number    = {7553},
  pages     = {436--444},
  year      = {2015},
  publisher = {Nature Publishing Group},
}

@article{liu2015deep,
  title     = {Deep feature for text-dependent speaker verification},
  author    = {Liu, Yuan and Qian, Yanmin and Chen, Nanxin and Fu, Tianfan and Zhang, Ya and Yu, Kai},
  journal   = {Speech Communication},
  volume    = {73},
  pages     = {1--13},
  year      = {2015},
  publisher = {Elsevier},
}

@article{torfi2018attention,
  title   = {Attention-Based Guided Structured Sparsity of Deep Neural Networks},
  author  = {Torfi, Amirsina and Shirvani, Rouzbeh A},
  journal = {arXiv preprint arXiv:1802.09902},
  year    = {2018},
}


164 changes: 150 additions & 14 deletions paper/paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,162 @@ date: 15 May 2018
bibliography: paper.bib
---

# Abstract
SpeechPy is an open source Python package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification.

# Summary
# Introduction
Automatic Speech Recognition (ASR) requires three main components for
further analysis: Preprocessing, feature extraction, and
post-processing. Feature extraction, in an abstract meaning, is
extracting descriptive features from raw signal for speech
classification purposes. Due to the high
dimensionality, the raw signal can be less informative compared to
extracted higher level features. Feature extraction comes to our rescue
for turning the high dimensional signal to a lower dimensional and yet
more informative version of that for sound recognition and
classification [@furui1986speaker; @guyon2008feature; @hirsch2000aurora].

``SpeechPy`` is an open source package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification.
![Scheme of speech recognition](_imgs/Scheme_of_speech_recognition_system.png)

Considering the recent advent of deep learning in ASR and SR and the importance of the accurate speech feature extraction, here are the motivations behind ``SpeechPy`` package:
Feature extraction, in essence, should be done considering the specific
application at hand. For example, in ASR applications, the linguistic
characteristics of the raw signal are of great importance and the other
characteristics must be
ignored [@yu2016automatic; @rabiner1993fundamentals]. On the other hand,
in Speaker Recognition (SR) task, solely voice-associated information
must be contained in extracted feature [@campbell1997speaker]. So the
feature extraction goal is to extract the relevant feature from the raw
signal and map it to a lower dimensional feature space. The problem of
feature extraction has been investigated in pattern classification aimed
at preventing the curse of dimensionality. There are some feature
extraction approaches based on information theory
[@torfi2017construction; @shannon2001mathematical] applied to multimodal
signals and demonstrated promising results [@gurban2009information].

* Developing a free open source package which covers important preprocessing techniques,
speech features, and post-processing operations required for ASR and SR applications.
The speech features can be categorized into two general types of
acoustic and linguistic features. The former one is mainly related to
non-verbal sounds and the latter one is associated with ASR and SR
systems for which the verbal part has the major role. Perhaps one of the most
famous linguistic features, which is hard to beat, is the Mel-Frequency
Cepstral Coefficients (MFCC). It uses speech raw frames in the range
from 20ms to 40ms for having stationary
characteristics [@rabiner1993fundamentals]. MFCC is widely used for both
ASR and SR tasks and more recently in the associated deep learning
applications as the input to the network rather than directly feeding
the signal [@deng2013recent; @lee2009unsupervised; @yu2011improved].
With the advent of deep learning [@lecun2015deep; @torfi2018attention],
major improvements have been achieved by using deep neural networks
rather than traditional methods for speech recognition
applications [@variani2014deep; @hinton2012deep; @liu2015deep].

* A simple package with a minimum degree of complexity should be available for beginners.
Despite the availability of free software for speech recognition, such as
VOICEBOX[^1], most of these packages are Matlab-based, which limits
their reproducibility due to commercial issues. Another great package is
PyAudioAnalysis [@giannakopoulos2015pyaudioanalysis], which is a
comprehensive package developed in Python. However, the issue with
PyAudioAnalysis is its complexity and verbosity for
extracting simple features, and it also lacks some important
preprocessing and post-processing operations in its current version.

* A well-tested and continuously integrated package for future developments should be developed.
Considering the recent advent of deep learning in ASR and SR and the
importance of the accurate speech feature extraction, here are the
motivations behind SpeechPy package:

``SpeechPy`` has been developed to satisfy the aforementioned needs. It contains the most important
preprocessing and post-processing operations and a selection of frequently used speech features. The
package is free and released as open source software. Continuous integration, for instant
error checking and validation of changes, has been deployed for ``SpeechPy``. Moreover, prior to the latest
official release of ``SpeechPy``, the package has successfully been utilized for research purposes
* Developing a free open source package which covers important
preprocessing techniques, speech features, and post-processing
operations required for ASR and SR applications.

# Acknowledgements
* A simple package with a minimum degree of complexity should be
available for beginners.

This work is based upon a work supported by the Center for Identification Technology Research (CITeR) and the National Science Foundation (NSF) under Grant #1650474. This work was completed in part with resources provided by the Center for Identification Technology Research (CITeR) at the West Virginia University. I would like to thank Professor Nasser Nasrabadi at West Virginia University for supporting me during the development of this project and for his valuable advice.
* A well-tested and continuously integrated package for future
developments should be developed.

SpeechPy has been developed to satisfy the aforementioned needs. It
contains the most important preprocessing and post-processing operations
and a selection of frequently used speech features. The package is free
and released as open source software[^2]. Continuous integration,
for instant error checking and validation of changes, has been deployed
for SpeechPy. Moreover, prior to the latest official release of
SpeechPy, the package has successfully been utilized for research
purposes [@torfi20173d; @torfi2017text].

# Package Eco-system


SpeechPy has been developed using the Python language for both its interface and
backend. An empirical study demonstrated that Python, as a
scripting language, is more effective and productive than conventional
languages[^3] for some programming problems, and memory consumption is
often “better than Java and not much worse than C or
C++” [@prechelt2000empirical]. We chose Python due to its simplicity and
popularity. Third-party libraries are avoided except *Numpy* and *Scipy*
for handling data and numeric computations.

## Complexity

As the user should not and does not even need to manipulate the internal
package structure, object-oriented programming is mostly used for
package development which provides easier interface for the user with a
sacrifice to the simplicity of the code. However, the internal code
complexity of the package does not affect the user experience since the
modules can easily be called with the associated arguments. SpeechPy is
a library with a collection of sub-modules.

## Code Style and Documentation

SpeechPy is constructed based on PEP 8 style guide for Python codes.
Moreover, it is extensively documented using the formatted docstrings
and Sphinx[^4] for further automatic modifications to the document in
case of changing internal modules. The full documentation of the project
will be generated in HTML and PDF format using Sphinx and is hosted
online. The official releases of the project are hosted on the Zenodo as
well[^5] [@torfispeechpy].

![A general view of the package](_imgs/packageview.png)

## Continuous Testing and Extensibility

The output of each function has been evaluated as well using different
tests as opposed to the other existing standard packages. For continuous
testing, the code is hosted on GitHub and integrated with Travis CI.
Each modification to the code must pass the unit tests defined for the
continuous integration. This will ensure the package does not break with
unadapted code scripts. However, the validity of the modifications
should always be investigated with the owner or authorized collaborators
of the project. The code will be tested at each time of modification for
Python versions *“2.7”*, *“3.4”* and *“3.5”*. In the future, these
versions are subject to change.

![Travis CI web interface after testing SpeechPy against a new change](_imgs/travicCI.png)

# Availability

## Operating system {#operating-system .unnumbered}

Tested on Ubuntu 14.04 and 16.04 LTS Linux, Apple Mac OS X 10.9.5 , and
Microsoft Windows 7 & 10. We expect that SpeechPy works on any
distribution as long as Python and the package dependencies are
installed.

## Programming language {#programming-language .unnumbered}

The package has been tested with Python 2.7, 3.4 and 3.5. However, using
Python 3.5 is suggested.

## Additional system requirements & dependencies {#additional-system-requirements-dependencies .unnumbered}

SpeechPy is a lightweight package, and little computational power is
needed to run it, although the speed of execution is entirely
dependent on the system architecture. The dependencies are as follows:

* Numpy

* SciPy

# Acknowledgements

This work has been completed in part with computational resources
provided by the West Virginia University and is based upon a work
supported by the Center for Identification Technology Research (CITeR)
and the National Science Foundation (NSF) under Grant \#1650474.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/test/_imgs/packageview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/test/_imgs/travicCI.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 4c2d901

Please sign in to comment.