Gracefully handling failed audio alignment

This commit is contained in:
Daniel Wolf 2016-06-21 19:20:27 +02:00
parent 944c374415
commit 0e00e58d91
1 changed file with 25 additions and 20 deletions

View File

@ -16,6 +16,7 @@
#include "languageModels.h" #include "languageModels.h"
#include "tokenization.h" #include "tokenization.h"
#include "g2p.h" #include "g2p.h"
#include "ContinuousTimeline.h"
extern "C" { extern "C" {
#include <pocketsphinx.h> #include <pocketsphinx.h>
@ -184,7 +185,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
return wordId; return wordId;
} }
BoundedTimeline<Phone> getPhoneAlignment( optional<BoundedTimeline<Phone>> getPhoneAlignment(
const vector<s3wid_t>& wordIds, const vector<s3wid_t>& wordIds,
unique_ptr<AudioStream> audioStream, unique_ptr<AudioStream> audioStream,
ps_decoder_t& decoder, ps_decoder_t& decoder,
@ -216,27 +217,30 @@ BoundedTimeline<Phone> getPhoneAlignment(
error = acmod_start_utt(acousticModel); error = acmod_start_utt(acousticModel);
if (error) throw runtime_error("Error starting utterance processing for alignment."); if (error) throw runtime_error("Error starting utterance processing for alignment.");
// Start search {
ps_search_start(search.get()); // Eventually end recognition
auto endRecognition = gsl::finally([&]() { acmod_end_utt(acousticModel); });
// Process entire sound stream // Start search
auto processBuffer = [&](const vector<int16_t>& buffer) { ps_search_start(search.get());
const int16* nextSample = buffer.data();
size_t remainingSamples = buffer.size(); // Process entire sound stream
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) { auto processBuffer = [&](const vector<int16_t>& buffer) {
while (acousticModel->n_feat_frame > 0) { const int16* nextSample = buffer.data();
ps_search_step(search.get(), acousticModel->output_frame); size_t remainingSamples = buffer.size();
acmod_advance(acousticModel); while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
while (acousticModel->n_feat_frame > 0) {
ps_search_step(search.get(), acousticModel->output_frame);
acmod_advance(acousticModel);
}
} }
} };
}; processAudioStream(*audioStream.get(), processBuffer, progressSink);
processAudioStream(*audioStream.get(), processBuffer, progressSink);
// End search // End search
ps_search_finish(search.get()); error = ps_search_finish(search.get());
if (error) return boost::none;
// End recognition }
acmod_end_utt(acousticModel);
// Extract phones with timestamps // Extract phones with timestamps
char** phoneNames = decoder.dict->mdef->ciname; char** phoneNames = decoder.dict->mdef->ciname;
@ -356,7 +360,8 @@ BoundedTimeline<Phone> detectPhones(
if (wordIds.empty()) continue; if (wordIds.empty()) continue;
// Align the words' phones with speech // Align the words' phones with speech
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *decoder.get(), alignmentProgressSink); BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *decoder.get(), alignmentProgressSink)
.value_or(ContinuousTimeline<Phone>(streamSegment->getTruncatedRange(), Phone::Unknown));
segmentPhones.shift(timedUtterance.getStart()); segmentPhones.shift(timedUtterance.getStart());
for (const auto& timedPhone : segmentPhones) { for (const auto& timedPhone : segmentPhones) {
logging::logTimedEvent("phone", timedPhone); logging::logTimedEvent("phone", timedPhone);