diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 1453cf9370d098..1dbba59f9ef326 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -139,8 +139,8 @@ Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~Speech2TextTokenizer.__call__`] + [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion + into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`] return_dict (`bool`, *optional*): If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple. kwargs: (*optional*) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors: diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index abefc1c1f3cf4e..7c2e1835370a62 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -600,8 +600,8 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~Speech2TextTokenizer.__call__`] + [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion + into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`] attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`: @@ -733,9 +733,9 @@ def forward( Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into - `input_features`, the [`Speech2TextTokenizer`] should be used for extracting the fbank features, + `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. See - [`~Speech2TextTokenizer.__call__`] + [`~Speech2TextFeatureExtractor.__call__`] attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 7848630314d410..4b698ca1ad52e9 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -650,8 +650,8 @@ def serving(self, inputs): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a - tensor of floats. See [`~Speech2TextTokenizer.__call__`] + [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion + into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`] attention_mask (`tf.Tensor` of shape `({0})`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -798,8 +798,8 @@ def call( Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into - `input_features`, the [`Speech2TextTokenizer`] should be used for extracting the fbank features, - padding and conversion into a tensor of floats. See [`~Speech2TextTokenizer.__call__`] + `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, + padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`] attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: