From ab42b861f051d7bd2af288ffcf81bc05e0f4a2bd Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Sat, 28 May 2022 21:25:20 +0200
Subject: [PATCH] Add hacky --alignmentFile option

---
 rhubarb/src/lib/rhubarbLib.cpp                |  6 ++--
 rhubarb/src/lib/rhubarbLib.h                  |  2 ++
 .../src/recognition/PhoneticRecognizer.cpp    |  4 +++
 rhubarb/src/recognition/PhoneticRecognizer.h  |  1 +
 .../recognition/PocketSphinxRecognizer.cpp    | 34 ++++++++++++++++++
 .../src/recognition/PocketSphinxRecognizer.h  |  1 +
 rhubarb/src/recognition/Recognizer.h          |  1 +
 rhubarb/src/rhubarb/main.cpp                  | 35 +++++++++++++++++++
 8 files changed, 82 insertions(+), 2 deletions(-)
diff --git a/rhubarb/src/lib/rhubarbLib.cpp b/rhubarb/src/lib/rhubarbLib.cpp
index 8b624cb..2ee2a75 100644
--- a/rhubarb/src/lib/rhubarbLib.cpp
+++ b/rhubarb/src/lib/rhubarbLib.cpp
@@ -11,13 +11,14 @@ using std::filesystem::path;
 JoiningContinuousTimeline<Shape> animateAudioClip(
 	const AudioClip& audioClip,
 	const optional<string>& dialog,
+	const optional<BoundedTimeline<Phone>>& alignedPhones, // optional pre-computed phones, handed to the recognizer as-is
 	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
 	ProgressSink& progressSink)
 {
 	const BoundedTimeline<Phone> phones =
-		recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink);
+		recognizer.recognizePhones(audioClip, dialog, alignedPhones, maxThreadCount, progressSink);
 	JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
 	return result;
 }
@@ -25,11 +26,12 @@ JoiningContinuousTimeline<Shape> animateAudioClip(
 JoiningContinuousTimeline<Shape> animateWaveFile(
 	path filePath,
 	const optional<string>& dialog,
+	const optional<BoundedTimeline<Phone>>& alignedPhones, // optional pre-computed phones, forwarded to animateAudioClip
 	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
 	ProgressSink& progressSink)
 {
 	const auto audioClip = createAudioFileClip(filePath);
-	return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink);
+	return animateAudioClip(*audioClip, dialog, alignedPhones, recognizer, targetShapeSet, maxThreadCount, progressSink);
 }
diff --git a/rhubarb/src/lib/rhubarbLib.h b/rhubarb/src/lib/rhubarbLib.h
index 6d7c4d1..f547262 100644
--- a/rhubarb/src/lib/rhubarbLib.h
+++ b/rhubarb/src/lib/rhubarbLib.h
@@ -11,6 +11,7 @@
 JoiningContinuousTimeline<Shape> animateAudioClip(
 	const AudioClip& audioClip,
 	const boost::optional<std::string>& dialog,
+	const boost::optional<BoundedTimeline<Phone>>& alignedPhones,
 	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
@@ -19,6 +20,7 @@ JoiningContinuousTimeline<Shape> animateAudioClip(
 JoiningContinuousTimeline<Shape> animateWaveFile(
 	std::filesystem::path filePath,
 	const boost::optional<std::string>& dialog,
+	const boost::optional<BoundedTimeline<Phone>>& alignedPhones,
 	const Recognizer& recognizer,
 	const ShapeSet& targetShapeSet,
 	int maxThreadCount,
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.cpp b/rhubarb/src/recognition/PhoneticRecognizer.cpp
index 68fb6fc..67500f1 100644
--- a/rhubarb/src/recognition/PhoneticRecognizer.cpp
+++ b/rhubarb/src/recognition/PhoneticRecognizer.cpp
@@ -106,8 +106,12 @@ static Timeline<Phone> utteranceToPhones(
 BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
 	const AudioClip& inputAudioClip,
 	optional<std::string> dialog,
+	optional<BoundedTimeline<Phone>> alignedPhones, // must be empty: this recognizer throws if it is set
 	int maxThreadCount,
 	ProgressSink& progressSink
 ) const {
+	if (alignedPhones) { // no alignment-based path here, so fail loudly instead of silently ignoring the argument
+		throw std::invalid_argument("Phonetic recognizer doesn't support specifying aligned phones in this POC.");
+	}
 	return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
 }
diff --git a/rhubarb/src/recognition/PhoneticRecognizer.h b/rhubarb/src/recognition/PhoneticRecognizer.h
index 96797cf..15ebe01 100644
--- a/rhubarb/src/recognition/PhoneticRecognizer.h
+++ b/rhubarb/src/recognition/PhoneticRecognizer.h
@@ -8,6 +8,7 @@ public:
 	BoundedTimeline<Phone> recognizePhones(
 		const AudioClip& inputAudioClip,
 		boost::optional<std::string> dialog,
+		boost::optional<BoundedTimeline<Phone>> alignedPhones, // not supported here: the implementation throws if set
 		int maxThreadCount,
 		ProgressSink& progressSink
 	) const override;
diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.cpp b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
index 9eaf5e3..44d1120 100644
--- a/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.cpp
@@ -6,8 +6,10 @@
 #include "languageModels.h"
 #include "tokenization.h"
 #include "g2p.h"
+#include "audio/DcOffset.h"
 #include "time/ContinuousTimeline.h"
 #include "audio/processing.h"
+#include "audio/voiceActivityDetection.h"
 #include "time/timedLogging.h"
 
 extern "C" {
@@ -334,9 +336,41 @@ static Timeline<Phone> utteranceToPhones(
 BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
 	const AudioClip& inputAudioClip,
 	optional<std::string> dialog,
+	optional<BoundedTimeline<Phone>> alignedPhones, // if set, speech recognition is skipped and these phones are used
 	int maxThreadCount,
 	ProgressSink& progressSink
 ) const {
+	if (alignedPhones) { // Alignment path: dialog and maxThreadCount are not used in this branch
+		// Work on a copy of the audio with any DC offset removed
+		const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();

+		// Split the audio into utterances via voice activity detection
+		JoiningBoundedTimeline<void> utterances;
+		try {
+			utterances = detectVoiceActivity(*audioClip, progressSink);
+		} catch (...) {
+			std::throw_with_nested(runtime_error("Error detecting segments of speech."));
+		}

+		BoundedTimeline<Phone> result(utterances.getRange()); // bounded by the range of the utterance timeline
+		for (auto& utterance : utterances) {
+			// Copy the supplied phones into a timeline bounded to this utterance (presumably dropping the rest)
+			BoundedTimeline<Phone> utteranceResult(utterance.getTimeRange(), *alignedPhones);

+			// Guess positions of noise sounds and mark them as Phone::Noise
+			const JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceResult.getRange(), utteranceResult);
+			for (const auto& noiseSound : noiseSounds) {
+				utteranceResult.set(noiseSound.getTimeRange(), Phone::Noise);
+			}

+			for (const auto& timedValue : utteranceResult) { // merge this utterance into the overall result
+				result.set(timedValue);
+			}
+		}

+		return result;
+	}

 	return ::recognizePhones(
 		inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
 }
diff --git a/rhubarb/src/recognition/PocketSphinxRecognizer.h b/rhubarb/src/recognition/PocketSphinxRecognizer.h
index dc11d2d..3b185d5 100644
--- a/rhubarb/src/recognition/PocketSphinxRecognizer.h
+++ b/rhubarb/src/recognition/PocketSphinxRecognizer.h
@@ -8,6 +8,7 @@ public:
 	BoundedTimeline<Phone> recognizePhones(
 		const AudioClip& inputAudioClip,
 		boost::optional<std::string> dialog,
+		boost::optional<BoundedTimeline<Phone>> alignedPhones, // if set, used instead of running speech recognition
 		int maxThreadCount,
 		ProgressSink& progressSink
 	) const override;
diff --git a/rhubarb/src/recognition/Recognizer.h b/rhubarb/src/recognition/Recognizer.h
index 6995f1f..33cdd80 100644
--- a/rhubarb/src/recognition/Recognizer.h
+++ b/rhubarb/src/recognition/Recognizer.h
@@ -12,6 +12,7 @@ public:
 	virtual BoundedTimeline<Phone> recognizePhones(
 		const AudioClip& audioClip,
 		boost::optional<std::string> dialog,
+		boost::optional<BoundedTimeline<Phone>> alignedPhones, // optional pre-computed phones; support varies by recognizer
 		int maxThreadCount,
 		ProgressSink& progressSink
 	) const = 0;
diff --git a/rhubarb/src/rhubarb/main.cpp b/rhubarb/src/rhubarb/main.cpp
index 85eec62..7e4c90f 100644
--- a/rhubarb/src/rhubarb/main.cpp
+++ b/rhubarb/src/rhubarb/main.cpp
@@ -115,6 +115,33 @@ ShapeSet getTargetShapeSet(const string& extendedShapesString) {
 	return result;
 }
 
+// Reads "<start> <end> <phone>" records (times in seconds); throws if the file is missing or can't be opened.
+BoundedTimeline<Phone> readAlignmentFile(const path& filePath) {
+	if (!exists(filePath)) {
+		throw std::invalid_argument(fmt::format("File {} does not exist.", filePath.u8string()));
+	}
+	try {
+		std::ifstream file;
+		file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+		file.open(filePath);
+		// Only open() should throw; a failed extraction must just end the loop. goodbit is the portable "none".
+		file.exceptions(std::ifstream::goodbit);
+		Timeline<Phone> result;
+		double start, end;
+		Phone phone;
+		// Extract in the condition: `while (file)` ran once more after the last record and stored garbage.
+		while (file >> start >> end >> phone) {
+			const centiseconds from(static_cast<int>(start * 100));
+			const centiseconds to(static_cast<int>(end * 100));
+			result.set(from, to, phone);
+		}
+		// Bound the returned timeline to the range reported by the parsed entries
+		return BoundedTimeline<Phone>(result.getRange(), result);
+	} catch (...) {
+		std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath.u8string())));
+	}
+}
+
 int main(int platformArgc, char* platformArgv[]) {
 	// Set up default logging so early errors are printed to stdout
 	const logging::Level defaultMinStderrLevel = logging::Level::Error;
@@ -174,6 +201,11 @@ int main(int platformArgc, char* platformArgv[]) {
 		false, "GHX", "string", cmd
 	);
 
+	tclap::ValueArg<string> alignmentFile(
+		"", "alignmentFile", "A .tsv file containing pre-calculated phoneme alignment data.",
+		false, string(), "string", cmd
+	);
+
 	tclap::ValueArg<string> dialogFile(
 		"d", "dialogFile", "A file containing the text of the dialog.",
 		false, string(), "string", cmd
@@ -263,6 +295,9 @@ int main(int platformArgc, char* platformArgv[]) {
 				dialogFile.isSet()
 					? readUtf8File(u8path(dialogFile.getValue()))
 					: boost::optional<string>(),
+				alignmentFile.isSet()
+					? readAlignmentFile(u8path(alignmentFile.getValue()))
+					: boost::optional<BoundedTimeline<Phone>>(),
 				*createRecognizer(recognizerType.getValue()),
 				targetShapeSet,
 				maxThreadCount.getValue(),