speech draft helper added

yohasebe · Jun 21, 2024 · 119906e · 119906e
1 parent b4c7711
commit 119906e
Show file tree

Hide file tree

Showing 12 changed files with 449 additions and 14 deletions.
diff --git a/bin/monadic_dev_start b/bin/monadic_dev_start
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Navigate to the ruby directory
+cd ./docker/services/ruby
+
+# Execute the monadic script with 'start' argument
+./bin/monadic_dev start
+
+echo "Monadic script executed with 'start' argument 🚀"
diff --git a/docker/monadic.sh b/docker/monadic.sh
@@ -6,7 +6,7 @@ export PATH=$PATH:/usr/local/bin
 export SELENIUM_IMAGE="selenium/standalone-chrome:123.0"
 # export SELENIUM_IMAGE="seleniarm/standalone-chromium:123.0"
 
-export MONADIC_VERSION=0.5.92
+export MONADIC_VERSION=0.6.0
 
 export HOST_OS=$(uname -s)
 
@@ -66,8 +66,8 @@ start_docker() {
 build_docker_compose() {
   remove_containers
 
-  # $DOCKER compose -f "$ROOT_DIR/services/docker-compose.yml" build --no-cache
-  $DOCKER compose -f "$ROOT_DIR/services/docker-compose.yml" build
+  $DOCKER compose -f "$ROOT_DIR/services/docker-compose.yml" build --no-cache
+  # $DOCKER compose -f "$ROOT_DIR/services/docker-compose.yml" build
   echo [HTML]: "<p>Monadic Chat has been built successfully!</p>"
 
   $DOCKER  tag yohasebe/monadic-chat:$MONADIC_VERSION yohasebe/monadic-chat:latest

diff --git a/docker/services/ruby/apps/speech_draft_helper/speech_draft_helper_app.rb b/docker/services/ruby/apps/speech_draft_helper/speech_draft_helper_app.rb
@@ -0,0 +1,199 @@
+class SpeechDraftHelper < MonadicApp
+
+  def icon
+    "<i class='fas fa-user-tie'></i>"
+  end
+
+  def description
+    "This app allows the user to submit a speech draft in the form of just a text string, a word file, or a pdf file. The app will then analyze it and return a revised version. The app will also provide suggestions for improvement and tips on how to make the speech more engaging and effective if the user needs them. if the user needs them Besides, it can also provide an mp3 file of the speech."
+  end
+
+  def initial_prompt
+    text = <<~TEXT
+      You are a speech draft helper assistant. You can help users with their speech drafts. Users can submit a speech draft in the form of a text string, a Word file, or a PDF file. You can analyze the speech and provide a revised version of the draft. You also provides feedback on its content, structure, and delivery if the user needs them. You can also provide suggestions for improvement and tips on how to make the speech more engaging and effective. If the user asks for it, you can provide an MP3 file of the speech according to their requirements (e.g., speed, voice, language).
+
+      First, get a speech draft or idea from the user. The user may give you a text segment in their message, or they may give you the name of a specific file available in your current environment. In that case, use the `fetch_text_from_file` function to fetch text from a text file (e.g., markdown, text, program scripts, etc.), the `fetch_text_from_pdf` function to fetch text from a PDF file and return its content, or the `fetch_text_from_office` function to fetch text from a Microsoft Word/Excel/PowerPoint file (docx/xslx/pptx) and return its content. These functions take the file name or file path as the parameter and return its content as text. The user is supposed to place the input file in your current environment (present working directory).
+
+      Alternatively, the user may give you a web URL. Then, please fetch the content of the web page using the `fetch_web_content` function. The function takes the web page URL as the parameter and saves its contents in a file. Read the file content and use it to answer the user's questions.
+
+      If the user requests an explanation of a specific image, you can use the `analyze_image` function to analyze the image and return the result. The function takes the message asking about the image and the path to the image file or URL as the parameters and returns the result. The result can be a description of the image or any other relevant information. In your response, present the text description and the <img> tag to display the image (e.g. `<img src="FILE_NAME" />`).
+
+      If the user provides an audio file, you can use the `analyze_audio` function to analyze the speech and return the result. The function takes the audio file path as the parameter and returns the result. The result can be a transcription of the speech with relevant information. In your response, present the text transcription and the <audio> tag to play the audio (`<audio controls src="FILE_NAME"></audio>`).
+
+      Once you have received the speech draft, analyze it and provide a revised version of the draft. You can provide feedback on its content, structure, and delivery.
+
+      If the user requests for it, provide an MP3 file of the speech. You can use the `text_to_speech` tool to provide an MP3 file of the speech. The tool takes the speech text and other parameters and returns the filename of the MP3 file of the speech. Here are the parameters you can use:
+
+      - `text`: The speech text to convert to speech.
+      - `speed`: Speed of the speech. Default is 1.0.
+      - `voice`: Voice of the speech. For male voices, you can use 'alloy', 'echo', 'fable', or 'onyx'. For female voices, you can use 'nova' or 'shimmer'. Default is 'alloy'.
+      - `language`: Language of the speech in the format "en", "es", etc. Default is 'auto'.
+
+      If you have generated an MP3, present it using the <audio> tag to play the audio (`<audio controls src="FILE_NAME"></audio>`).
+    TEXT
+
+    text.strip
+  end
+
+  def settings
+    {
+      "model": "gpt-4o",
+      "temperature": 0.0,
+      "top_p": 0.0,
+      "context_size": 20,
+      "initial_prompt": initial_prompt,
+      "easy_submit": false,
+      "auto_speech": false,
+      "app_name": "Speech Draft Helper",
+      "description": description,
+      "icon": icon,
+      "initiate_from_assistant": true,
+      "image": true,
+      "audio_video": true,
+      "tools": [
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "fetch_text_from_office",
+            "description": "Fetch the text from the Microsoft Word/Excel/PowerPoint file and return it.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "file": {
+                  "type": "string",
+                  "description": "File name or file path of the Microsoft Word/Excel/PowerPoint file."
+                }
+              }
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "fetch_text_from_pdf",
+            "description": "Fetch the text from the PDF file and return it.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "pdf": {
+                  "type": "string",
+                  "description": "File name or file path of the PDF"
+                }
+              },
+              "required": ["pdf"]
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "fetch_web_content",
+            "description": "Fetch the content of the web page of the given URL and save it to a file.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "url": {
+                  "type": "string",
+                  "description": "URL of the web page."
+                }
+              },
+              "required": ["url"]
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "analyze_image",
+            "description": "Analyze the image and return the result.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "message": {
+                  "type": "string",
+                  "description": "Text prompt asking about the image (e.g. 'What is in the image?')."
+                },
+                "image_path": {
+                  "type": "string",
+                  "description": "Path to the image file. It can be either a local file path or a URL."
+                }
+              },
+              "required": ["message", "image_path"]
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "analyze_audio",
+            "description": "Analyze the audio and return the transcript.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "audio": {
+                  "type": "string",
+                  "description": "File path of the audio file"
+                }
+              },
+              "required": ["audio"]
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "fetch_text_from_file",
+            "description": "Fetch the text from a file and return its content.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "file": {
+                  "type": "string",
+                  "description": "File name or file path"
+                }
+              },
+              "required": ["file"]
+            }
+          }
+        },
+        {
+          "type": "function",
+          "function":
+          {
+            "name": "text_to_speech",
+            "description": "Convert the text to speech to generate an MP3 file and retrun the filename.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "text": {
+                  "type": "string",
+                  "description": "Text to convert to speech."
+                },
+                "speed": {
+                  "type": "string",
+                  "description": "Speed of the speech. Default is 1.0."
+                },
+                "voice": {
+                  "type": "string",
+                  "enum": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+                  "description": "Voice of the speech. Default is 'alloy'."
+                },
+                "language": {
+                  "type": "string",
+                  "description": "Language of the speech. Default is 'auto'."
+                }
+              },
+              "required": ["text"]
+            }
+          }
+        }
+      ]
+    }
+  end
+end
diff --git a/docker/services/ruby/bin/monadic_dev b/docker/services/ruby/bin/monadic_dev
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+
+# frozen_string_literal: false
+
+require "optimist"
+
+require_relative "../lib/monadic/version"
+
+# change current directory to the parent directory of the directory containing this file
+Dir.chdir(File.expand_path(File.join(__dir__, "..")))
+
+DEFAULT_PORT = 4567
+
+selenium_image = "selenium/standalone-chrome:latest"
+if `uname -s`.chomp.include? "Darwin"
+  if `uname -m`.chomp == "arm64"
+    selenium_image = "seleniarm/standalone-chromium:latest"
+  end
+end
+ENV['SELENIUM_IMAGE'] = selenium_image
+ENV['MONADIC_VERSION'] = Monadic::VERSION.to_s
+ENV['HOST_OS'] = `uname -s`.chomp
+
+# Parse command line options
+opts = Optimist.options do
+  version Monadic::VERSION.to_s
+  banner "Usage: monadic [start|stop|restart] [options]"
+  opt :daemonize, "Enable or disable daemon mode", default: false
+  opt :log, "Enable or disable logging mode", default: false
+  opt :port, "Specify the port number", type: :integer, default: DEFAULT_PORT
+end
+
+DOCKER_HOME = File.expand_path(File.join(__dir__, "..", ".."))
+
+# Start the server
+def start_server(opts)
+  start_script = File.join(DOCKER_HOME, "support_scripts", "mac-start-docker.sh")
+  mac_docker_start = File.read(start_script)
+
+  # run the start script
+  system(mac_docker_start)
+  # system("docker compose -f #{DOCKER_HOME}/docker-compose.yml build ruby_service") 
+
+  # stop "monadic-chat-container" if it is running
+  system("docker container stop monadic-chat-ruby-container") if system("docker container ls | grep monadic-chat-ruby-container")
+
+  cmd_selenium = "docker container start monadic-chat-selenium-container"
+  cmd_pg = "docker container start monadic-chat-pgvector-container"
+  cmd_python = "docker container start monadic-chat-python-container"
+
+  system(cmd_selenium)
+  system(cmd_pg)
+  system(cmd_python)
+
+  cmd = "thin start -R #{DOCKER_HOME}/ruby/config.ru -p #{opts[:port] || DEFAULT_PORT}"
+  cmd += " --daemonize" if opts[:daemonize]
+  cmd += " --log thin.log" if opts[:log]
+  if system(cmd)
+    puts "Server started on port #{opts[:port] || DEFAULT_PORT}"
+  else
+    puts "Server failed to start"
+  end
+end
+
+# Stop the server
+def stop_server
+  if system("thin stop")
+    puts "Server stopped"
+  else
+    puts "Server failed to stop"
+  end
+end
+
+# Restart the server: stop it first, then start it again with the same
+# parsed options. The start is attempted whether or not the stop succeeded,
+# since stop_server only reports its outcome.
+def restart_server(opts)
+  stop_server
+  start_server(opts)
+end
+
+# Parse subcommand
+subcommand = ARGV.shift
+
+# Execute subcommand
+case subcommand
+when "start"
+  start_server(opts)
+when "stop"
+  stop_server
+when "restart"
+  restart_server(opts)
+else
+  Optimist.die "Unknown subcommand. Use \"start\", \"stop\", or \"restart\"."
+end
diff --git a/docker/services/ruby/lib/embeddings/text_embeddings.rb b/docker/services/ruby/lib/embeddings/text_embeddings.rb
@@ -6,12 +6,6 @@
 require "json"
 require "dotenv/load"
 
-# return true if we are inside a docker container
-def in_container?
-  File.file?("/.dockerenv")
-end
-
-IN_CONTAINER = in_container?
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 
 class TextEmbeddings

diff --git a/docker/services/ruby/lib/helpers/websocket.rb b/docker/services/ruby/lib/helpers/websocket.rb
@@ -94,7 +94,7 @@ def websocket_handler(env)
           res_hash = tts_api_request(text, voice, speed, response_format, model)
           @channel.push(res_hash.to_json)
         when "TTS_STREAM"
-          tts_thread&.join
+          thread&.join
           text = obj["text"]
           voice = obj["voice"]
           speed = obj["speed"]

diff --git a/docker/services/ruby/lib/monadic.rb b/docker/services/ruby/lib/monadic.rb
@@ -25,6 +25,14 @@
 require "oj"
 Oj.mimic_JSON()
 
+
+# Return true if we are inside a docker container.
+# The check is whether "/.dockerenv" exists as a regular file.
+def in_container?
+  File.file?("/.dockerenv")
+end
+
+IN_CONTAINER = in_container?
+
 require_relative "helpers/text_splitter"
 require_relative "helpers/flask_app_client"
 
@@ -49,6 +57,7 @@
 EMBEDDINGS_DB = TextEmbeddings.new("monadic", recreate_db: false)
 
 CONFIG = {}
+
 begin
   if File.file?("/.dockerenv")
     File.read("/monadic/data/.env").split("\n").each do |line|

diff --git a/docker/services/ruby/lib/monadic/monadic_app.rb b/docker/services/ruby/lib/monadic/monadic_app.rb
@@ -372,6 +372,26 @@ def analyze_audio(audio: "")
     send_command(command: command, container: "ruby")
   end
 
+  def text_to_speech(text: "", speed:1.0, voice: "alloy", language: "auto")
+    text = text.gsub(/"/, '\"')
+
+    primary_save_path = "/monadic/data/"
+    secondary_save_path = File.expand_path("~/monadic/data/")
+
+    save_path = Dir.exist?(primary_save_path) ? primary_save_path : secondary_save_path
+    textfile = "#{Time.now.to_i}.md"
+    textpath = File.join(save_path, textfile)
+
+    File.open(textpath, "w") do |f|
+      f.write(text)
+    end
+
+    command = <<~CMD
+      bash -c 'simple_tts_query.rb "#{textpath}" --speed=#{speed} --voice=#{voice} --language=#{language}'
+    CMD
+    send_command(command: command, container: "ruby")
+  end
+
   def generate_image(prompt: "", size: "1024x1024", num_retrials: 10)
     command = <<~CMD
       bash -c 'simple_image_generation.rb -p "#{prompt}" -s "#{size}"'