forked from argmaxinc/WhisperKit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CLIArguments.swift
111 lines (75 loc) · 4.13 KB
/
CLIArguments.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// For licensing see accompanying LICENSE.md file.
// Copyright © 2024 Argmax, Inc. All rights reserved.
import ArgumentParser
/// Options and flags accepted on the command line, declared with ArgumentParser
/// property wrappers so they can be parsed as a `ParsableArguments` group.
///
/// Each property's `help:` text is the user-facing description of that argument.
/// Optional properties have no default and are `nil` when the argument is omitted.
struct CLIArguments: ParsableArguments {
    // MARK: - Audio input

    @Option(help: "Paths to audio files")
    var audioPath: [String] = []

    @Option(help: "Path to a folder containing audio files")
    var audioFolder: String?

    // MARK: - Model selection and download

    @Option(help: "Path of model files")
    var modelPath: String?

    @Option(help: "Model to download if no modelPath is provided")
    var model: String?

    @Option(help: "Text to add in front of the model name to specify between different types of the same variant (values: \"openai\", \"distil\")")
    var modelPrefix: String = "openai"

    @Option(help: "Path to save the downloaded model")
    var downloadModelPath: String?

    @Option(help: "Path to save the downloaded tokenizer files")
    var downloadTokenizerPath: String?

    // MARK: - Compute units

    @Option(help: "Compute units for audio encoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
    var audioEncoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine

    @Option(help: "Compute units for text decoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
    var textDecoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine

    // MARK: - General

    @Flag(help: "Verbose mode")
    var verbose: Bool = false

    // MARK: - Decoding options

    @Option(help: "Task to perform (transcribe or translate)")
    var task: String = "transcribe"

    @Option(help: "Language spoken in the audio")
    var language: String?

    @Option(help: "Temperature to use for sampling")
    var temperature: Float = 0

    @Option(help: "Temperature to increase on fallbacks during decoding")
    var temperatureIncrementOnFallback: Float = 0.2

    @Option(help: "Number of times to increase temperature when falling back during decoding")
    var temperatureFallbackCount: Int = 5

    @Option(help: "Number of candidates when sampling with non-zero temperature")
    var bestOf: Int = 5

    @Flag(help: "Force initial prompt tokens based on language, task, and timestamp options")
    var usePrefillPrompt: Bool = false

    @Flag(help: "Use decoder prefill data for faster initial decoding")
    var usePrefillCache: Bool = false

    @Flag(help: "Skip special tokens in the output")
    var skipSpecialTokens: Bool = false

    @Flag(help: "Force no timestamps when decoding")
    var withoutTimestamps: Bool = false

    @Flag(help: "Add timestamps for each word in the output")
    var wordTimestamps: Bool = false

    @Option(help: "Force prefix text when decoding")
    var prefix: String?

    @Option(help: "Condition on this text when decoding")
    var prompt: String?

    @Option(parsing: .upToNextOption, help: "List of timestamps (in seconds) of start and end values to transcribe as separate clips in single audio file (example: --clip-timestamps 0 10.2 34.5 60.0)")
    var clipTimestamps: [Float] = []

    // The property name, and therefore the `--supress-tokens` flag shown in the
    // help example, keeps its existing spelling so current invocations keep working.
    // Only the descriptive prose of the help text is corrected.
    @Option(parsing: .upToNextOption, help: "List of tokens to suppress in the output (example: --supress-tokens 1 2 3)")
    var supressTokens: [Int] = []

    // MARK: - Decoding failure thresholds

    @Option(help: "Gzip compression ratio threshold for decoding failure")
    var compressionRatioThreshold: Float?

    @Option(help: "Average log probability threshold for decoding failure")
    var logprobThreshold: Float?

    @Option(help: "Log probability threshold for first token decoding failure")
    var firstTokenLogProbThreshold: Float?

    @Option(help: "Probability threshold to consider a segment as silence")
    var noSpeechThreshold: Float?

    // MARK: - Reporting

    @Flag(help: "Output a report of the results")
    var report: Bool = false

    @Option(help: "Directory to save the report")
    var reportPath: String = "."

    // MARK: - Streaming, concurrency, and chunking

    @Flag(help: "Process audio directly from the microphone")
    var stream: Bool = false

    @Flag(help: "Simulate streaming transcription using the input audio file")
    var streamSimulated: Bool = false

    @Option(help: "Maximum concurrent inference, might be helpful when processing more than 1 audio file at the same time. 0 means unlimited")
    var concurrentWorkerCount: Int = 0

    @Option(help: "Chunking strategy for audio processing, `nil` means no chunking, `vad` means using voice activity detection")
    var chunkingStrategy: String?
}