# autogpt/speak.py

import os

import requests
from playsound import playsound

from autogpt.config import Config

import threading
from threading import Lock, Semaphore

import gtts

cfg = Config()

# Voice IDs used for a slot when no usable custom voice is configured
default_voices = ["ErXwobaYiN019PkySvjV", "EXAVITQu4vr4xnSDxMaL"]

# Custom voice IDs as read from the Config object
custom_voice_1 = cfg.elevenlabs_voice_1_id
custom_voice_2 = cfg.elevenlabs_voice_2_id

# Config values that mean "nothing was configured"
placeholders = {"your-voice-id"}

# One voice per slot: the custom ID wins unless it is empty or a placeholder,
# in which case the default ID for that slot is used instead
voices = [
    fallback if not custom or custom in placeholders else custom
    for custom, fallback in zip((custom_voice_1, custom_voice_2), default_voices)
]

# Headers sent with every ElevenLabs text-to-speech request
tts_headers = {
    "Content-Type": "application/json",
    "xi-api-key": cfg.elevenlabs_api_key,
}

# Held while an audio file is written, played and removed, so only one sound
# plays at a time
mutex_lock = Lock()

# Number of sounds that may be queued before say_text blocks its caller
queue_semaphore = Semaphore(1)


def eleven_labs_speech(text, voice_index=0):
    """Speak text using the elevenlabs.io API.

    Args:
        text: The text to synthesize.
        voice_index: Index into the module-level ``voices`` list that selects
            the voice ID placed in the request URL.

    Returns:
        True if the API answered with HTTP 200 and the audio was played,
        False otherwise (the status code and response body are printed).
    """
    tts_url = "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}".format(
        voice_id=voices[voice_index]
    )
    formatted_message = {"text": text}
    response = requests.post(tts_url, headers=tts_headers, json=formatted_message)

    if response.status_code != 200:
        # Runtime messages are kept as-is ("request failed, status code:" /
        # "response content:").
        print("请求失败，状态码为:", response.status_code)
        print("响应内容:", response.content)
        return False

    audio_path = "speech.mpeg"
    with mutex_lock:
        try:
            with open(audio_path, "wb") as f:
                f.write(response.content)
            playsound(audio_path, True)
        finally:
            # Remove the temporary file even when writing or playback fails,
            # so a failed attempt does not leave stale audio behind.
            if os.path.exists(audio_path):
                os.remove(audio_path)
    return True


def brian_speech(text):
    """Speak text using the Brian voice of the streamelements API.

    Args:
        text: The text to synthesize.

    Returns:
        True if the API answered with HTTP 200 and the audio was played,
        False otherwise (the status code and response body are printed).
    """
    from urllib.parse import quote

    # Percent-encode the text so characters such as "&", "#", "+" or "%" stay
    # part of the spoken text instead of being parsed as URL syntax.
    encoded_text = quote(str(text), safe="")
    tts_url = (
        "https://api.streamelements.com/kappa/v2/speech"
        f"?voice=Brian&text={encoded_text}"
    )
    response = requests.get(tts_url)

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        print("Response content:", response.content)
        return False

    audio_path = "speech.mp3"
    with mutex_lock:
        try:
            with open(audio_path, "wb") as f:
                f.write(response.content)
            playsound(audio_path)
        finally:
            # Remove the temporary file even when writing or playback fails.
            if os.path.exists(audio_path):
                os.remove(audio_path)
    return True


def gtts_speech(text):
    """Speak text using the gtts library.

    The audio is saved to a temporary ``speech.mp3`` file, played, and then
    removed, all while holding ``mutex_lock``.

    Args:
        text: The text to synthesize.
    """
    tts = gtts.gTTS(text)
    audio_path = "speech.mp3"
    with mutex_lock:
        try:
            tts.save(audio_path)
            playsound(audio_path, True)
        finally:
            # Remove the temporary file even when saving or playback fails.
            if os.path.exists(audio_path):
                os.remove(audio_path)


def macos_tts_speech(text, voice_index=0):
    """Speak text with the ``say`` command-line tool.

    Args:
        text: The text to speak.
        voice_index: 0 uses the default voice, 1 uses "Ava (Premium)", and
            any other value uses "Samantha".
    """
    import shlex

    # The text is passed through a shell, so quote it: otherwise quotes,
    # "$(...)" or backticks inside the text would be interpreted (and
    # executed) by the shell instead of being spoken.
    quoted_text = shlex.quote(str(text))

    if voice_index == 0:
        command = f"say {quoted_text}"
    elif voice_index == 1:
        command = f'say -v "Ava (Premium)" {quoted_text}'
    else:
        command = f"say -v Samantha {quoted_text}"
    os.system(command)


def say_text(text, voice_index=0):
    """Speak text on a background thread using the configured backend.

    Backend selection:
      * If ``cfg.elevenlabs_api_key`` is set, ElevenLabs is used, falling
        back to gtts when the request fails.
      * Otherwise macOS ``say`` is used when ``cfg.use_mac_os_tts`` equals
        the string "True".
      * Otherwise the Brian voice is used when ``cfg.use_brian_tts`` equals
        the string "True", falling back to gtts when the request fails.
      * Otherwise gtts is used.

    The call blocks until ``queue_semaphore`` can be acquired, then returns
    as soon as the worker thread has been started.

    Args:
        text: The text to speak.
        voice_index: Voice selector forwarded to the ElevenLabs and macOS
            backends.
    """

    def speak():
        try:
            if cfg.elevenlabs_api_key:
                if not eleven_labs_speech(text, voice_index):
                    gtts_speech(text)
            elif cfg.use_mac_os_tts == "True":
                # Forward the requested voice instead of always using the
                # default one.
                macos_tts_speech(text, voice_index)
            elif cfg.use_brian_tts == "True":
                if not brian_speech(text):
                    gtts_speech(text)
            else:
                gtts_speech(text)
        finally:
            # Always free the queue slot; if a backend raised and the slot
            # stayed taken, the next say_text call would block forever.
            queue_semaphore.release()

    queue_semaphore.acquire(True)
    thread = threading.Thread(target=speak)
    thread.start()