Skip to content

Commit

Permalink
AudioSourceFetcherImpl can listen to audio streams.
Browse files Browse the repository at this point in the history
Adds and implements AudioSourceFetcher::Start, along with the audio
capture callback that converts audio data into mojom structs and sends
them to the SpeechRecognitionRecognizer.

Refactors audio conversion work, which is common between
ChromeSpeechRecognitionClient and AudioSourceFetcher, into
AudioDataS16Converter.

For more, see go/cros-dictation-with-soda.

This completes end-to-end functionality for OnDeviceSpeechRecognizer,
which Projector and Dictation can use for on-device speech
recognition.

There are some follow-ups needed to pass audio parameters from
OnDeviceSpeechRecognizer to AudioSourceFetcher, left as TODOs.

Bug: 1173135
Change-Id: I460034d780ab07010b7a23c8dab018e6d4754a1f
AX-Relnotes: N/A
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2740120
Commit-Queue: Katie Dektar <katie@chromium.org>
Reviewed-by: Robert Sesek <rsesek@chromium.org>
Reviewed-by: Avi Drissman <avi@chromium.org>
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Reviewed-by: Evan Liu <evliu@google.com>
Cr-Commit-Position: refs/heads/master@{#862219}
  • Loading branch information
Katie Dektar authored and Chromium LUCI CQ committed Mar 12, 2021
1 parent 2effd41 commit 9204339
Show file tree
Hide file tree
Showing 15 changed files with 436 additions and 175 deletions.
6 changes: 1 addition & 5 deletions chrome/browser/service_sandbox_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,7 @@ class SpeechRecognitionService;
template <>
inline sandbox::policy::SandboxType
content::GetServiceSandboxType<media::mojom::SpeechRecognitionService>() {
if (base::FeatureList::IsEnabled(media::kUseSodaForLiveCaption)) {
return sandbox::policy::SandboxType::kSpeechRecognition;
} else {
return sandbox::policy::SandboxType::kUtility;
}
return sandbox::policy::SandboxType::kSpeechRecognition;
}
#endif // !defined(OS_ANDROID)

Expand Down
2 changes: 1 addition & 1 deletion chrome/browser/speech/on_device_speech_recognizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ OnDeviceSpeechRecognizer::~OnDeviceSpeechRecognizer() {
}

void OnDeviceSpeechRecognizer::Start() {
// TODO(crbug.com/1173135): Call audio_source_fetcher_->Start();
audio_source_fetcher_->Start();
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_RECOGNIZING);
}

Expand Down
100 changes: 92 additions & 8 deletions chrome/browser/speech/speech_recognition_service_browsertest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
#include "base/files/file_util.h"
#include "base/notreached.h"
#include "base/path_service.h"
#include "base/sync_socket.h"
#include "base/test/metrics/histogram_tester.h"
#include "base/test/scoped_feature_list.h"
#include "base/timer/timer.h"
#include "build/build_config.h"
#include "chrome/browser/browser_process.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/profiles/profile_manager.h"
Expand All @@ -19,12 +22,19 @@
#include "components/prefs/pref_service.h"
#include "components/soda/pref_names.h"
#include "content/public/browser/audio_service.h"
#include "content/public/common/content_switches.h"
#include "content/public/test/browser_test.h"
#include "media/audio/wav_audio_handler.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/audio_input_stream.mojom.h"
#include "media/mojo/mojom/audio_stream_factory.mojom.h"
#include "media/mojo/mojom/media_types.mojom.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "sandbox/policy/switches.h"
#include "services/audio/public/cpp/fake_stream_factory.h"
#include "testing/gmock/include/gmock/gmock.h"

using testing::StrictMock;

namespace speech {

Expand All @@ -42,6 +52,72 @@ constexpr int kExpectedChannelCount = 1;
// Relative path to the test-only SODA library loaded by these tests.
constexpr base::FilePath::CharType kSodaBinaryRelativePath[] =
    FILE_PATH_LITERAL("libsoda_for_testing.so");

// Size of the shared-memory region handed back from CreateInputStream.
// TODO: Derive this from the audio parameters instead of hard-coding it;
// the current value is brittle.
const size_t kShMemSize = 82240;

class MockStream : public media::mojom::AudioInputStream {
public:
MOCK_METHOD0(Record, void());
MOCK_METHOD1(SetVolume, void(double));
};

// Fake audio::StreamFactory that hands out a StrictMock<MockStream> as the
// input stream, so tests can verify the AudioSourceFetcher starts
// recording without touching the real audio service.
class TestStreamFactory : public audio::FakeStreamFactory {
 public:
  TestStreamFactory() : stream_(), stream_receiver_(&stream_) {}
  ~TestStreamFactory() override = default;

  // Fulfills the stream-creation contract: binds |stream_receiver| to the
  // mock stream, keeps |client| alive, and runs |created_callback| with a
  // freshly created shared-memory region and socket pair.
  // NOTE(review): the signature presumably matches
  // audio::FakeStreamFactory's virtual method — confirm and mark
  // `override` if so.
  void CreateInputStream(
      mojo::PendingReceiver<media::mojom::AudioInputStream> stream_receiver,
      mojo::PendingRemote<media::mojom::AudioInputStreamClient> client,
      mojo::PendingRemote<media::mojom::AudioInputStreamObserver> observer,
      mojo::PendingRemote<media::mojom::AudioLog> log,
      const std::string& device_id,
      const media::AudioParameters& params,
      uint32_t shared_memory_count,
      bool enable_agc,
      base::ReadOnlySharedMemoryRegion key_press_count_buffer,
      CreateInputStreamCallback created_callback) {
    // Drop any previous binding so repeated calls rebind cleanly.
    if (stream_receiver_.is_bound())
      stream_receiver_.reset();
    stream_receiver_.Bind(std::move(stream_receiver));
    if (client_)
      client_.reset();
    // Keep the passed client alive to avoid binding errors.
    client_.Bind(std::move(client));
    // The callback requires a readable region plus one end of a socket
    // pair; the other end (socket2) is intentionally discarded.
    base::SyncSocket socket1, socket2;
    base::SyncSocket::CreatePair(&socket1, &socket2);
    std::move(created_callback)
        .Run({base::in_place,
              base::ReadOnlySharedMemoryRegion::Create(kShMemSize).region,
              mojo::PlatformHandle(socket1.Take())},
             false /*initially muted*/, base::UnguessableToken::Create());
  }

  // Returns a remote the code under test can use as its stream factory.
  mojo::PendingRemote<media::mojom::AudioStreamFactory> MakeRemote() {
    return receiver_.BindNewPipeAndPassRemote();
  }

  // Blocks until CreateInputStream has bound the mock stream, polling
  // every 10 ms. Returns immediately if the stream is already bound.
  void WaitToCreateInputStream() {
    if (stream_receiver_.is_bound())
      return;
    base::RepeatingTimer check_timer;
    check_timer.Start(FROM_HERE, base::TimeDelta::FromMilliseconds(10), this,
                      &TestStreamFactory::OnTimer);
    runner_.Run();
  }

  // Public so tests can set expectations directly (e.g. EXPECT_CALL on
  // stream_).
  StrictMock<MockStream> stream_;
  mojo::Remote<media::mojom::AudioInputStreamClient> client_;
  mojo::Receiver<media::mojom::AudioInputStream> stream_receiver_;

 private:
  // Timer callback for WaitToCreateInputStream: quits the run loop once
  // the stream has been bound.
  void OnTimer() {
    if (stream_receiver_.is_bound())
      runner_.Quit();
  }

  base::RunLoop runner_;
};

class SpeechRecognitionServiceTest
: public InProcessBrowserTest,
public media::mojom::SpeechRecognitionRecognizerClient {
Expand Down Expand Up @@ -85,6 +161,8 @@ class SpeechRecognitionServiceTest
mojo::Receiver<media::mojom::SpeechRecognitionRecognizerClient>
speech_recognition_client_receiver_{this};

std::unique_ptr<StrictMock<TestStreamFactory>> stream_factory_;

std::vector<std::string> recognition_results_;

DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionServiceTest);
Expand Down Expand Up @@ -138,6 +216,9 @@ void SpeechRecognitionServiceTest::LaunchService() {
}

void SpeechRecognitionServiceTest::LaunchServiceWithAudioSourceFetcher() {
// Create a fake stream factory.
stream_factory_ = std::make_unique<StrictMock<TestStreamFactory>>();

// Launch the Speech Recognition service.
auto* browser_context =
static_cast<content::BrowserContext*>(browser()->profile());
Expand All @@ -148,17 +229,13 @@ void SpeechRecognitionServiceTest::LaunchServiceWithAudioSourceFetcher() {
speech_recognition_context_.BindNewPipeAndPassReceiver();
service->Create(std::move(speech_recognition_context_receiver));

mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory;
content::GetAudioServiceStreamFactoryBinder().Run(
stream_factory.InitWithNewPipeAndPassReceiver());

bool success = false;
auto run_loop = std::make_unique<base::RunLoop>();
// Bind the recognizer pipes used to send audio and receive results.
speech_recognition_context_->BindAudioSourceFetcher(
audio_source_fetcher_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
std::move(stream_factory),
stream_factory_->MakeRemote(),
base::BindOnce(
[](bool* p_success, base::RunLoop* run_loop, bool success) {
*p_success = success;
Expand Down Expand Up @@ -270,11 +347,18 @@ IN_PROC_BROWSER_TEST_F(SpeechRecognitionServiceTest, CreateAudioSourceFetcher) {
profile_prefs->SetBoolean(prefs::kLiveCaptionEnabled, true);
LaunchServiceWithAudioSourceFetcher();

// Ensure no crashes.
// TODO(crbug.com/1185978): Check implementation / sandbox policy on Mac and
// Windows.
#if defined(OS_CHROMEOS) || defined(OS_LINUX)
// Check that Start begins audio recording.
// TODO(crbug.com/1173135): Try to mock audio input, maybe with
// FakeStreamFactory, to test end-to-end.
audio_source_fetcher_->Stop();
// TestStreamFactory::stream_, to test end-to-end.
EXPECT_CALL(stream_factory_->stream_, Record());
audio_source_fetcher_->Start();
stream_factory_->WaitToCreateInputStream();
#endif

audio_source_fetcher_->Stop();
base::RunLoop().RunUntilIdle();
}

Expand Down
1 change: 1 addition & 0 deletions chrome/renderer/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ static_library("renderer") {
"//media",
"//media:media_buildflags",
"//media/capture",
"//media/mojo/common",
"//mojo/public/cpp/bindings",
"//net",
"//pdf:buildflags",
Expand Down
115 changes: 5 additions & 110 deletions chrome/renderer/media/chrome_speech_recognition_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,18 @@ ChromeSpeechRecognitionClient::~ChromeSpeechRecognitionClient() = default;
void ChromeSpeechRecognitionClient::AddAudio(
scoped_refptr<media::AudioBuffer> buffer) {
DCHECK(buffer);
send_audio_callback_.Run(ConvertToAudioDataS16(std::move(buffer)));
send_audio_callback_.Run(
ConvertToAudioDataS16(std::move(buffer), is_multichannel_supported_));
}

void ChromeSpeechRecognitionClient::AddAudio(
std::unique_ptr<media::AudioBus> audio_bus,
int sample_rate,
media::ChannelLayout channel_layout) {
DCHECK(audio_bus);
send_audio_callback_.Run(
ConvertToAudioDataS16(std::move(audio_bus), sample_rate, channel_layout));
send_audio_callback_.Run(ConvertToAudioDataS16(std::move(audio_bus),
sample_rate, channel_layout,
is_multichannel_supported_));
}

bool ChromeSpeechRecognitionClient::IsSpeechRecognitionAvailable() {
Expand Down Expand Up @@ -179,84 +181,6 @@ void ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService(
}
}

// Converts an AudioBuffer into the mojom AudioDataS16 format expected by
// the speech recognition service: interleaved signed 16-bit samples,
// downmixed to mono when the service does not support multichannel audio.
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    scoped_refptr<media::AudioBuffer> buffer) {
  DCHECK_GT(buffer->frame_count(), 0);
  DCHECK_GT(buffer->channel_count(), 0);
  DCHECK_GT(buffer->sample_rate(), 0);

  auto signed_buffer = media::mojom::AudioDataS16::New();
  signed_buffer->channel_count = buffer->channel_count();
  signed_buffer->frame_count = buffer->frame_count();
  signed_buffer->sample_rate = buffer->sample_rate();

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  if (buffer->channel_count() > 1 && !is_multichannel_supported_) {
    signed_buffer->channel_count = 1;
    // Deinterleave into temp_audio_bus_, downmix into monaural_audio_bus_,
    // then re-interleave as S16 into the outgoing buffer.
    CopyBufferToTempAudioBus(*buffer);
    ResetChannelMixer(buffer->frame_count(), buffer->channel_layout());
    signed_buffer->data.resize(buffer->frame_count());
    channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), &signed_buffer->data[0]);
    return signed_buffer;
  }

  // If the audio is already in the interleaved signed int 16 format, directly
  // assign it to the buffer.
  if (buffer->sample_format() == media::SampleFormat::kSampleFormatS16) {
    // kSampleFormatS16 is interleaved, so channel_data()[0] holds all
    // frame_count * channel_count samples contiguously.
    int16_t* audio_data = reinterpret_cast<int16_t*>(buffer->channel_data()[0]);
    signed_buffer->data.assign(
        audio_data,
        audio_data + buffer->frame_count() * buffer->channel_count());
    return signed_buffer;
  }

  // Convert the raw audio to the interleaved signed int 16 sample type.
  CopyBufferToTempAudioBus(*buffer);
  signed_buffer->data.resize(buffer->frame_count() * buffer->channel_count());
  temp_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
      temp_audio_bus_->frames(), &signed_buffer->data[0]);

  return signed_buffer;
}

// Converts a raw AudioBus (planar float samples) into the mojom
// AudioDataS16 format: interleaved signed 16-bit samples, downmixed to
// mono when the service does not support multichannel audio.
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK_GT(audio_bus->frames(), 0);
  DCHECK_GT(audio_bus->channels(), 0);

  auto signed_buffer = media::mojom::AudioDataS16::New();
  signed_buffer->channel_count = audio_bus->channels();
  signed_buffer->frame_count = audio_bus->frames();
  signed_buffer->sample_rate = sample_rate;

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  if (audio_bus->channels() > 1 && !is_multichannel_supported_) {
    signed_buffer->channel_count = 1;
    ResetChannelMixer(audio_bus->frames(), channel_layout);
    signed_buffer->data.resize(audio_bus->frames());

    // Downmix into monaural_audio_bus_, then interleave as S16.
    channel_mixer_->Transform(audio_bus.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), &signed_buffer->data[0]);

    return signed_buffer;
  }

  // Mono (or multichannel-capable service): interleave directly.
  signed_buffer->data.resize(audio_bus->frames() * audio_bus->channels());
  audio_bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(
      audio_bus->frames(), &signed_buffer->data[0]);

  return signed_buffer;
}

void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) {
if (!success && is_browser_requesting_transcription_) {
speech_recognition_recognizer_->OnCaptionBubbleClosed();
Expand All @@ -265,35 +189,6 @@ void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) {
is_browser_requesting_transcription_ = success;
}

// Deinterleaves |buffer| into the reusable temp_audio_bus_ member,
// (re)allocating the bus only when its geometry no longer matches.
void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
    const media::AudioBuffer& buffer) {
  // Reuse the existing bus when channel count and frame count both match;
  // otherwise create one with the right shape.
  const bool bus_matches = temp_audio_bus_ &&
                           temp_audio_bus_->channels() ==
                               buffer.channel_count() &&
                           temp_audio_bus_->frames() == buffer.frame_count();
  if (!bus_matches) {
    temp_audio_bus_ =
        media::AudioBus::Create(buffer.channel_count(), buffer.frame_count());
  }

  // Copy every frame of |buffer| into the bus, starting at offset 0 on
  // both sides.
  buffer.ReadFrames(buffer.frame_count(),
                    /* source_frame_offset */ 0, /* dest_frame_offset */ 0,
                    temp_audio_bus_.get());
}

// Prepares the channel mixer and the monaural output bus for a downmix of
// |frame_count| frames from |channel_layout| to mono. Both the bus and
// the mixer are cached and only rebuilt when their parameters change.
void ChromeSpeechRecognitionClient::ResetChannelMixer(
    int frame_count,
    media::ChannelLayout channel_layout) {
  // The mono bus is reused across calls; rebuild only on a size change.
  const bool needs_new_bus =
      !monaural_audio_bus_ || monaural_audio_bus_->frames() != frame_count;
  if (needs_new_bus) {
    monaural_audio_bus_ =
        media::AudioBus::Create(1 /* channels */, frame_count);
  }

  // Rebuild the mixer only when the input layout changes, since mixer
  // construction depends solely on the source and destination layouts.
  if (channel_layout_ != channel_layout) {
    channel_layout_ = channel_layout;
    channel_mixer_ = std::make_unique<media::ChannelMixer>(
        channel_layout, media::CHANNEL_LAYOUT_MONO);
  }
}

// Returns true when |url| appears in the blocklist.
bool ChromeSpeechRecognitionClient::IsUrlBlocked(const std::string& url) const {
  return blocked_urls_.count(url) != 0;
}
Expand Down
Loading

0 comments on commit 9204339

Please sign in to comment.