osl-incubator · xmnlab · Feb 2, 2024 · Feb 2, 2024
diff --git a/README.md b/README.md
@@ -70,6 +70,22 @@ $ artbox voice text-to-speech \
     --lang en-IN
 ```
 
+Additionally, if you are using edge-tts, you can specify `--rate`, `--volume`,
+and `--pitch`, for example:
+
+```bash
+$ echo "Do you want some coffee?" > /tmp/artbox/text.md
+$ artbox voice text-to-speech \
+    --title artbox \
+    --text-path /tmp/artbox/text.md \
+    --output-path /tmp/artbox/voice.mp3 \
+    --engine edge-tts \
+    --lang en \
+    --rate +10% \
+    --volume -10% \
+    --pitch -5Hz
+```
+
 ### Download a youtube video
 
 If you want to download videos from the youtube, you can use the following
@@ -152,10 +168,3 @@ If you want to use Python to play your audio files, you can install `playsound`:
 ```bash
 $ pip wheel --use-pep517 "playsound (==1.3.0)"
 ```
-
-## Troubleshoot
-
-After installing with `poetry install`:
-
-- Patch `pytube` (ref: https://github.com/pytube/pytube/issues/1773):
-  `sed -i 's/(r"^$\\w+\\W")/(r"^\\w+\\W")/' $CONDA_PREFIX/lib/python3.*/site-packages/pytube/cipher.py`
diff --git a/docs/index.md b/docs/index.md
@@ -70,6 +70,22 @@ $ artbox voice text-to-speech \
     --lang en-IN
 ```
 
+Additionally, if you are using edge-tts, you can specify `--rate`, `--volume`,
+and `--pitch`, for example:
+
+```bash
+$ echo "Do you want some coffee?" > /tmp/artbox/text.md
+$ artbox voice text-to-speech \
+    --title artbox \
+    --text-path /tmp/artbox/text.md \
+    --output-path /tmp/artbox/voice.mp3 \
+    --engine edge-tts \
+    --lang en \
+    --rate +10% \
+    --volume -10% \
+    --pitch -5Hz
+```
+
 ### Download a youtube video
 
 If you want to download videos from the youtube, you can use the following

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,6 @@ exclude = [
 
 [tool.poetry.dependencies]
 python = ">3.8.1,<3.12"
-pytube = ">=15.0.0"
 pycairo = ">=1.24.0"
 pygobject = ">=3.44.1"
 openai = ">=1"
@@ -32,6 +31,7 @@ gtts = ">=2.3.2"
 edge-tts = ">=6.1.8"
 numpy = ">=1.20"
 typer = ">=0.9.0"
+pytubefix = ">=1.13.3"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.3.2"
@@ -113,6 +113,6 @@ module = [
   "noisereduce",
   "pydub",
   "pydub.generators",
-  "pytube",
+  "pytubefix",
 ]
 ignore_missing_imports = true
diff --git a/src/artbox/cli.py b/src/artbox/cli.py
@@ -66,7 +66,7 @@ def main(
 
 
 @app_voice.command("text-to-speech")
-def text_to_speech(
+def voice_text_to_speech(
     title: Annotated[
         str, typer.Option("--title", help="Specify the name of the audio file")
     ] = "artbox",
@@ -93,6 +93,18 @@ def text_to_speech(
             "--lang", help="Choose the language for audio generation"
         ),
     ] = "en",
+    rate: Annotated[
+        str,
+        typer.Option("--rate", help="Decrease/Increase the rate level"),
+    ] = "+0%",
+    volume: Annotated[
+        str,
+        typer.Option("--volume", help="Decrease/Increase the volume level"),
+    ] = "+0%",
+    pitch: Annotated[
+        str,
+        typer.Option("--pitch", help="Decrease/Increase the pitch level"),
+    ] = "+0Hz",
 ) -> None:
     """Convert text to speech."""
     args_dict = {
@@ -101,14 +113,17 @@ def text_to_speech(
         "output-path": output_path,
         "engine": engine,
         "lang": lang,
+        "rate": rate,
+        "volume": volume,
+        "pitch": pitch,
     }
 
     runner = Voice(args_dict)
     runner.text_to_speech()
 
 
 @app_sound.command("notes-to-audio")
-def notes_to_audio(
+def sound_notes_to_audio(
     input_path: Annotated[
         str,
         typer.Option(
@@ -138,7 +153,7 @@ def notes_to_audio(
 
 
 @app_video.command("remove-audio")
-def remove_audio(
+def video_remove_audio(
     input_path: Annotated[
         str,
         typer.Option(
@@ -163,7 +178,7 @@ def remove_audio(
 
 
 @app_video.command("extract-audio")
-def extract_audio(
+def video_extract_audio(
     input_path: Annotated[
         str,
         typer.Option(
@@ -189,7 +204,7 @@ def extract_audio(
 
 
 @app_video.command("combine-video-and-audio")
-def combine_audio_and_video(
+def video_combine_audio_and_video(
     video_path: Annotated[
         str,
         typer.Option(
@@ -222,7 +237,7 @@ def combine_audio_and_video(
 
 
 @app_youtube.command("download")
-def download_youtube_video(
+def youtube_download(
     url: Annotated[
         str,
         typer.Option(
@@ -252,3 +267,42 @@ def download_youtube_video(
 
     runner = Youtube(args_dict)
     runner.download()
+
+
+@app_youtube.command("cc")
+def youtube_cc(
+    url: Annotated[
+        str,
+        typer.Option(
+            "--url", help="Specify the URL of the YouTube video to get CC from"
+        ),
+    ] = "",
+    output_path: Annotated[
+        str,
+        typer.Option(
+            "--output-path",
+            help=(
+                "Specify the path to store the downloaded captions file "
+                "(.srt, .txt)"
+            ),
+        ),
+    ] = "/tmp/cc.txt",
+    lang: Annotated[
+        str,
+        typer.Option("--lang", help="Set the CC language to be downloaded"),
+    ] = "en",
+    format: Annotated[
+        str,
+        typer.Option("--format", help="Set the CC format (srt, text)"),
+    ] = "text",
+) -> None:
+    """Download the closed captions (CC) of a YouTube video."""
+    args_dict = {
+        "url": url,
+        "output-path": output_path,
+        "lang": lang,
+        "format": format,
+    }
+
+    runner = Youtube(args_dict)
+    runner.download_captions()
diff --git a/src/artbox/videos.py b/src/artbox/videos.py
@@ -6,7 +6,7 @@
 from abc import abstractmethod
 
 from moviepy.editor import AudioFileClip, VideoFileClip
-from pytube import YouTube as PyYouTube
+from pytubefix import YouTube as PyYouTube
 
 from artbox.base import ArtBox
 
@@ -20,6 +20,27 @@ def download(self):
         ...
 
 
+def _convert_srt_to_plain_text(srt_text: str) -> str:
+    """
+    Convert SRT caption text to plain text, dropping counters and timestamps.
+
+    Parameters
+    ----------
+    srt_text (str): Caption contents in SRT format (not a file path).
+
+    Returns
+    -------
+    str: The remaining caption text lines, stripped and joined by newlines.
+    """
+    # A digits-only line is a cue counter only if a timestamp follows it.
+    lines = [line.strip() for line in srt_text.split("\n")]
+    return "\n".join(
+        line
+        for line, nxt in zip(lines, lines[1:] + [""])
+        if line and "-->" not in line and not (line.isdigit() and "-->" in nxt)
+    )
+
+
 class Youtube(DownloadBase):
     """Set of tools for handing videos."""
 
@@ -52,6 +73,27 @@ def download(self):
         except Exception as e:
             print(f"Failed to download video: {e}")
 
+    def download_captions(self):
+        """Download a YouTube video's captions in the requested language."""
+        video_url = self.args.get("url", "")
+        lang = self.args.get("lang", "en")
+        fmt = self.args.get("format", "text")
+
+        yt = PyYouTube(video_url)
+        # NOTE(review): "a." presumably means auto-generated captions; confirm.
+        caption = yt.captions.get_by_language_code(f"a.{lang}")
+        if not caption:
+            print(f"No captions found for language {lang}.")
+            return
+
+        cc = caption.generate_srt_captions()
+        if fmt == "text":
+            cc = _convert_srt_to_plain_text(cc)
+        # Captions may hold non-ASCII text; don't rely on the locale encoding.
+        with open(str(self.output_path), "w", encoding="utf-8") as f:
+            f.write(cc)
+        print("Captions downloaded successfully.")
+
 
 class Video(ArtBox):
     """Set of tools for handing videos."""

diff --git a/src/artbox/voices.py b/src/artbox/voices.py
@@ -79,6 +79,9 @@ async def async_text_to_speech(self) -> None:
         title: str = self.args.get("title", "")
         text_path: str = self.args.get("text-path", "")
         lang: str = self.args.get("lang", "en")
+        rate = self.args.get("rate", "+0%")
+        volume = self.args.get("volume", "+0%")
+        pitch = self.args.get("pitch", "+0Hz")
 
         if not title:
             raise Exception("Argument `title` not given")
@@ -96,8 +99,9 @@ async def async_text_to_speech(self) -> None:
         communicate = edge_tts.Communicate(
             text=text,
             voice=random.choice(voice_options)["Name"],
-            rate="+5%",
-            volume="+0%",
+            rate=rate,
+            volume=volume,
+            pitch=pitch,
         )
         with open(self.output_path, "wb") as file:
             async for chunk in communicate.stream():