diff --git a/app.py b/app.py
index 97fa9e82..42850b6b 100644
--- a/app.py
+++ b/app.py
@@ -7,10 +7,10 @@
 """
 from tempfile import NamedTemporaryFile
+import argparse
 import torch
 import gradio as gr
 from audiocraft.models import MusicGen
-
 from audiocraft.data.audio import audio_write
@@ -61,90 +61,150 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     return waveform_video


-with gr.Blocks() as demo:
-    gr.Markdown(
-        """
-        # MusicGen
+def ui(**kwargs):
+    with gr.Blocks() as interface:
+        gr.Markdown(
+            """
+            # MusicGen
+
+            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
+            <br/>
+            <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" target="_blank">
+            <img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+            for longer sequences, more control and no queue.<br/>
+ """ + ) + with gr.Row(): + with gr.Column(): + with gr.Row(): + text = gr.Text(label="Input Text", interactive=True) + melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True) + with gr.Row(): + submit = gr.Button("Submit") + with gr.Row(): + model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True) + with gr.Row(): + duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True) + with gr.Row(): + topk = gr.Number(label="Top-k", value=250, interactive=True) + topp = gr.Number(label="Top-p", value=0, interactive=True) + temperature = gr.Number(label="Temperature", value=1.0, interactive=True) + cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True) + with gr.Column(): + output = gr.Video(label="Generated Music") + submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output]) + gr.Examples( + fn=predict, + examples=[ + [ + "An 80s driving pop song with heavy drums and synth pads in the background", + "./assets/bach.mp3", + "melody" + ], + [ + "A cheerful country song with acoustic guitars", + "./assets/bolero_ravel.mp3", + "melody" + ], + [ + "90s rock song with electric guitar and heavy drums", + None, + "medium" + ], + [ + "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions", + "./assets/bach.mp3", + "melody" + ], + [ + "lofi slow bpm electro chill with organic samples", + None, + "medium", + ], + ], + inputs=[text, melody, model], + outputs=[output] + ) + gr.Markdown( + """ + ### More details + + The model will generate a short music extract based on the description you provided. + You can generate up to 30 seconds of audio. + + We present 4 model variations: + 1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only. + 2. Small -- a 300M transformer decoder conditioned on text only. + 3. Medium -- a 1.5B transformer decoder conditioned on text only. + 4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.) + + When using `melody`, ou can optionaly provide a reference audio from + which a broad melody will be extracted. The model will then try to follow both the description and melody provided. + + You can also use your own GPU or a Google Colab by following the instructions on our repo. + See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft) + for more details. + """ + ) - This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation - presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284). -
-        <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" target="_blank">
-        <img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        for longer sequences, more control and no queue.<br/>
- """ + # Show the interface + launch_kwargs = {} + username = kwargs.get('username') + password = kwargs.get('password') + server_port = kwargs.get('server_port', 0) + inbrowser = kwargs.get('inbrowser', False) + share = kwargs.get('share', False) + server_name = kwargs.get('listen') + + launch_kwargs['server_name'] = server_name + + if username and password: + launch_kwargs['auth'] = (username, password) + if server_port > 0: + launch_kwargs['server_port'] = server_port + if inbrowser: + launch_kwargs['inbrowser'] = inbrowser + if share: + launch_kwargs['share'] = share + + interface.launch(**launch_kwargs) + +if __name__ == "__main__": + # torch.cuda.set_per_process_memory_fraction(0.48) + parser = argparse.ArgumentParser() + parser.add_argument( + '--listen', + type=str, + default='127.0.0.1', + help='IP to listen on for connections to Gradio', ) - with gr.Row(): - with gr.Column(): - with gr.Row(): - text = gr.Text(label="Input Text", interactive=True) - melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True) - with gr.Row(): - submit = gr.Button("Submit") - with gr.Row(): - model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True) - with gr.Row(): - duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True) - with gr.Row(): - topk = gr.Number(label="Top-k", value=250, interactive=True) - topp = gr.Number(label="Top-p", value=0, interactive=True) - temperature = gr.Number(label="Temperature", value=1.0, interactive=True) - cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True) - with gr.Column(): - output = gr.Video(label="Generated Music") - submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output]) - gr.Examples( - fn=predict, - examples=[ - [ - "An 80s driving pop song with heavy drums and synth pads in the background", - "./assets/bach.mp3", - "melody" - ], - [ - "A cheerful country song with acoustic guitars", - "./assets/bolero_ravel.mp3", - "melody" - ], - [ - "90s rock song with electric guitar and heavy drums", - None, - "medium" - ], - [ - "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions", - "./assets/bach.mp3", - "melody" - ], - [ - "lofi slow bpm electro chill with organic samples", - None, - "medium", - ], - ], - inputs=[text, melody, model], - outputs=[output] + parser.add_argument( + '--username', type=str, default='', help='Username for authentication' + ) + parser.add_argument( + '--password', type=str, default='', help='Password for authentication' ) - gr.Markdown( - """ - ### More details - - The model will generate a short music extract based on the description you provided. - You can generate up to 30 seconds of audio. - - We present 4 model variations: - 1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only. - 2. Small -- a 300M transformer decoder conditioned on text only. - 3. Medium -- a 1.5B transformer decoder conditioned on text only. - 4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.) - - When using `melody`, ou can optionaly provide a reference audio from - which a broad melody will be extracted. The model will then try to follow both the description and melody provided. - - You can also use your own GPU or a Google Colab by following the instructions on our repo. 
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-        for more details.
-        """
+    parser.add_argument(
+        '--server_port',
+        type=int,
+        default=0,
+        help='Port to run the server listener on',
     )
+    parser.add_argument(
+        '--inbrowser', action='store_true', help='Open in browser'
+    )
+    parser.add_argument(
+        '--share', action='store_true', help='Share the Gradio UI'
+    )
+
+    args = parser.parse_args()
-demo.launch()
+    ui(
+        username=args.username,
+        password=args.password,
+        inbrowser=args.inbrowser,
+        server_port=args.server_port,
+        share=args.share,
+        listen=args.listen
+    )
\ No newline at end of file
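
A minimal smoke test for the new entry point (a hypothetical snippet, not part of the diff): it imports `ui()` from app.py and passes the same keyword arguments the `__main__` block forwards from argparse. The host, port, and credential values below are placeholders.

```python
# Hypothetical check of the new ui() entry point; roughly equivalent to running:
#   python app.py --listen 0.0.0.0 --server_port 7860 --username admin --password secret
from app import ui

ui(
    listen='0.0.0.0',   # stored as launch_kwargs['server_name']
    server_port=7860,   # forwarded to Gradio only when > 0
    username='admin',   # auth is enabled only when both username
    password='secret',  # and password are non-empty
    inbrowser=False,    # True would open a browser tab on launch
    share=False,        # True would create a public Gradio share link
)
```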