From ab42b861f051d7bd2af288ffcf81bc05e0f4a2bd Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Sat, 28 May 2022 21:25:20 +0200 Subject: [PATCH] Add hacky --alignmentFile option --- rhubarb/src/lib/rhubarbLib.cpp | 6 ++-- rhubarb/src/lib/rhubarbLib.h | 2 ++ .../src/recognition/PhoneticRecognizer.cpp | 4 +++ rhubarb/src/recognition/PhoneticRecognizer.h | 1 + .../recognition/PocketSphinxRecognizer.cpp | 34 ++++++++++++++++++ .../src/recognition/PocketSphinxRecognizer.h | 1 + rhubarb/src/recognition/Recognizer.h | 1 + rhubarb/src/rhubarb/main.cpp | 35 +++++++++++++++++++ 8 files changed, 82 insertions(+), 2 deletions(-) diff --git a/rhubarb/src/lib/rhubarbLib.cpp b/rhubarb/src/lib/rhubarbLib.cpp index 8b624cb..2ee2a75 100644 --- a/rhubarb/src/lib/rhubarbLib.cpp +++ b/rhubarb/src/lib/rhubarbLib.cpp @@ -11,13 +11,14 @@ using std::filesystem::path; JoiningContinuousTimeline animateAudioClip( const AudioClip& audioClip, const optional& dialog, + const optional>& alignedPhones, const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink) { const BoundedTimeline phones = - recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink); + recognizer.recognizePhones(audioClip, dialog, alignedPhones, maxThreadCount, progressSink); JoiningContinuousTimeline result = animate(phones, targetShapeSet); return result; } @@ -25,11 +26,12 @@ JoiningContinuousTimeline animateAudioClip( JoiningContinuousTimeline animateWaveFile( path filePath, const optional& dialog, + const optional>& alignedPhones, const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, ProgressSink& progressSink) { const auto audioClip = createAudioFileClip(filePath); - return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink); + return animateAudioClip(*audioClip, dialog, alignedPhones, recognizer, targetShapeSet, maxThreadCount, progressSink); } diff --git a/rhubarb/src/lib/rhubarbLib.h b/rhubarb/src/lib/rhubarbLib.h index 6d7c4d1..f547262 100644 --- a/rhubarb/src/lib/rhubarbLib.h +++ b/rhubarb/src/lib/rhubarbLib.h @@ -11,6 +11,7 @@ JoiningContinuousTimeline animateAudioClip( const AudioClip& audioClip, const boost::optional& dialog, + const boost::optional>& alignedPhones, const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, @@ -19,6 +20,7 @@ JoiningContinuousTimeline animateAudioClip( JoiningContinuousTimeline animateWaveFile( std::filesystem::path filePath, const boost::optional& dialog, + const boost::optional>& alignedPhones, const Recognizer& recognizer, const ShapeSet& targetShapeSet, int maxThreadCount, diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp index 68fb6fc..67500f1 100644 --- a/rhubarb/src/recognition/PhoneticRecognizer.cpp +++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp @@ -106,8 +106,12 @@ static Timeline utteranceToPhones( BoundedTimeline PhoneticRecognizer::recognizePhones( const AudioClip& inputAudioClip, optional dialog, + optional> alignedPhones, int maxThreadCount, ProgressSink& progressSink ) const { + if (alignedPhones) { + throw std::invalid_argument("Phonetic recognizer doesn't support specifying aligned phones in this POC."); + } return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink); } diff --git a/rhubarb/src/recognition/PhoneticRecognizer.h b/rhubarb/src/recognition/PhoneticRecognizer.h index 96797cf..15ebe01 100644 --- a/rhubarb/src/recognition/PhoneticRecognizer.h +++ b/rhubarb/src/recognition/PhoneticRecognizer.h @@ -8,6 +8,7 @@ public: BoundedTimeline recognizePhones( const AudioClip& inputAudioClip, boost::optional dialog, + boost::optional> alignedPhones, int maxThreadCount, ProgressSink& progressSink ) const override; diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.cpp b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp index 9eaf5e3..44d1120 100644 --- a/rhubarb/src/recognition/PocketSphinxRecognizer.cpp +++ b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp @@ -6,8 +6,10 @@ #include "languageModels.h" #include "tokenization.h" #include "g2p.h" +#include "audio/DcOffset.h" #include "time/ContinuousTimeline.h" #include "audio/processing.h" +#include "audio/voiceActivityDetection.h" #include "time/timedLogging.h" extern "C" { @@ -334,9 +336,41 @@ static Timeline utteranceToPhones( BoundedTimeline PocketSphinxRecognizer::recognizePhones( const AudioClip& inputAudioClip, optional dialog, + optional> alignedPhones, int maxThreadCount, ProgressSink& progressSink ) const { + if (alignedPhones) { + // Make sure audio stream has no DC offset + const unique_ptr audioClip = inputAudioClip.clone() | removeDcOffset(); + + // Split audio into utterances + JoiningBoundedTimeline utterances; + try { + utterances = detectVoiceActivity(*audioClip, progressSink); + } catch (...) { + std::throw_with_nested(runtime_error("Error detecting segments of speech.")); + } + + BoundedTimeline result(utterances.getRange()); + for (auto& utterance : utterances) { + // Copy over utterance phones + BoundedTimeline utteranceResult(utterance.getTimeRange(), *alignedPhones); + + // Guess positions of noise sounds + const JoiningTimeline noiseSounds = getNoiseSounds(utteranceResult.getRange(), utteranceResult); + for (const auto& noiseSound : noiseSounds) { + utteranceResult.set(noiseSound.getTimeRange(), Phone::Noise); + } + + for (const auto& timedValue : utteranceResult) { + result.set(timedValue); + } + } + + return result; + } + return ::recognizePhones( inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink); } diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.h b/rhubarb/src/recognition/PocketSphinxRecognizer.h index dc11d2d..3b185d5 100644 --- a/rhubarb/src/recognition/PocketSphinxRecognizer.h +++ b/rhubarb/src/recognition/PocketSphinxRecognizer.h @@ -8,6 +8,7 @@ public: BoundedTimeline recognizePhones( const AudioClip& inputAudioClip, boost::optional dialog, + boost::optional> alignedPhones, int maxThreadCount, ProgressSink& progressSink ) const override; diff --git a/rhubarb/src/recognition/Recognizer.h b/rhubarb/src/recognition/Recognizer.h index 6995f1f..33cdd80 100644 --- a/rhubarb/src/recognition/Recognizer.h +++ b/rhubarb/src/recognition/Recognizer.h @@ -12,6 +12,7 @@ public: virtual BoundedTimeline recognizePhones( const AudioClip& audioClip, boost::optional dialog, + boost::optional> alignedPhones, int maxThreadCount, ProgressSink& progressSink ) const = 0; diff --git a/rhubarb/src/rhubarb/main.cpp b/rhubarb/src/rhubarb/main.cpp index 85eec62..7e4c90f 100644 --- a/rhubarb/src/rhubarb/main.cpp +++ b/rhubarb/src/rhubarb/main.cpp @@ -115,6 +115,33 @@ ShapeSet getTargetShapeSet(const string& extendedShapesString) { return result; } +BoundedTimeline readAlignmentFile(const path& filePath) { + if (!exists(filePath)) { + throw std::invalid_argument(fmt::format("File {} does not exist.", filePath.u8string())); + } + try { + std::ifstream file; + file.exceptions(std::ifstream::failbit | std::ifstream::badbit); + file.open(filePath); + file.exceptions(0); + Timeline result; + while (file) { + double start, end; + Phone phone; + file >> start >> end >> phone; + result.set( + centiseconds(static_cast(start * 100)), + centiseconds(static_cast(end * 100)), + phone + ); + } + + return BoundedTimeline(result.getRange(), result); + } catch (...) { + std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath.u8string()))); + } +} + int main(int platformArgc, char* platformArgv[]) { // Set up default logging so early errors are printed to stdout const logging::Level defaultMinStderrLevel = logging::Level::Error; @@ -174,6 +201,11 @@ int main(int platformArgc, char* platformArgv[]) { false, "GHX", "string", cmd ); + tclap::ValueArg alignmentFile( + "", "alignmentFile", "A .tsv file containing pre-calculated phoneme alignment data.", + false, string(), "string", cmd + ); + tclap::ValueArg dialogFile( "d", "dialogFile", "A file containing the text of the dialog.", false, string(), "string", cmd @@ -263,6 +295,9 @@ int main(int platformArgc, char* platformArgv[]) { dialogFile.isSet() ? readUtf8File(u8path(dialogFile.getValue())) : boost::optional(), + alignmentFile.isSet() + ? readAlignmentFile(u8path(alignmentFile.getValue())) + : boost::optional>(), *createRecognizer(recognizerType.getValue()), targetShapeSet, maxThreadCount.getValue(),