A simple command-line tool for text-to-sound generation, using AudioCLIP and a neural network generator. If the experiments are satisfactory, I will combine this with Big Sleep to synthesize both image and audio from text.
@comment{AudioCLIP paper, arXiv preprint 2106.13043. Case-sensitive words in the title are braced so that sentence-casing bibliography styles keep their capitals; author names use the unambiguous "Last, First" form.}
@misc{guzhov2021audioclip,
  title         = {{AudioCLIP}: Extending {CLIP} to Image, Text and Audio},
  author        = {Guzhov, Andrey and Raue, Federico and Hees, J{\"o}rn and Dengel, Andreas},
  year          = {2021},
  eprint        = {2106.13043},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
}