Skip to content

Commit

Permalink
adjust cut off sec and target transcript
Browse files Browse the repository at this point in the history
  • Loading branch information
pgosar committed May 4, 2024
1 parent 9fb6d94 commit 1a896d2
Showing 1 changed file with 29 additions and 4 deletions.
33 changes: 29 additions & 4 deletions tts_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,13 @@ def parse_arguments():
default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
help="original transcript")
parser.add_argument("-tt", "--target_transcript", type=str,
default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
default="object was seen as a mirage in the lake in the distance,",
help="target transcript")
parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6,
help="cut off point in seconds for input prompt")
parser.add_argument("-ma", "--margin", type=float, default=0.07,
help="lowest margin in seconds between words for input prompt")

args = parser.parse_args()
return args

Expand Down Expand Up @@ -135,11 +138,33 @@ def parse_arguments():
# if the above fails, it could be because the audio is too hard for the alignment model,
# increasing the beam size usually solves the issue

def find_closest_word_boundary(alignments, cut_off_sec, margin):
with open(alignments, 'r') as file:
# skip header
next(file)
prev_end = 0.0
cutoff_time = None
cutoff_index = None
for i, line in enumerate(file):
end = float(line.strip().split(',')[1])
if end >= cut_off_sec and end - prev_end >= margin:
cutoff_time = end + margin / 2
cutoff_index = i
break

prev_end = end

return cutoff_time, cutoff_index

# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
cut_off_sec = args.cut_off_sec # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio
target_transcript = args.target_transcript
# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
# NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio
cut_off_sec = args.cut_off_sec
margin = args.margin
audio_fn = f"{temp_folder}/{filename}.wav"
alignments = f"{temp_folder}/mfa_alignments/{filename}.csv"
cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin)
target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript
# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
info = torchaudio.info(audio_fn)
audio_dur = info.num_frames / info.sample_rate

Expand Down

0 comments on commit 1a896d2

Please sign in to comment.