Add hacky --alignmentFile option

This commit is contained in:
Daniel Wolf 2022-05-28 21:25:20 +02:00
parent 364a5d4fe4
commit ab42b861f0
8 changed files with 82 additions and 2 deletions

View File

@ -11,13 +11,14 @@ using std::filesystem::path;
// Animates an audio clip: obtains the clip's phones from the given recognizer and turns them
// into a timeline of shapes via animate(), restricted to targetShapeSet.
// dialog and alignedPhones are optional; this function does not inspect them, it only forwards
// them to recognizer.recognizePhones() together with maxThreadCount and progressSink.
// NOTE(review): this is a two-column diff rendering; alignedPhones exists only in the right-hand
// (post-commit) column.
JoiningContinuousTimeline<Shape> animateAudioClip( JoiningContinuousTimeline<Shape> animateAudioClip(
const AudioClip& audioClip, const AudioClip& audioClip,
const optional<string>& dialog, const optional<string>& dialog,
const optional<BoundedTimeline<Phone>>& alignedPhones,
const Recognizer& recognizer, const Recognizer& recognizer,
const ShapeSet& targetShapeSet, const ShapeSet& targetShapeSet,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink) ProgressSink& progressSink)
{ {
const BoundedTimeline<Phone> phones = const BoundedTimeline<Phone> phones =
recognizer.recognizePhones(audioClip, dialog, maxThreadCount, progressSink); recognizer.recognizePhones(audioClip, dialog, alignedPhones, maxThreadCount, progressSink);
JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet); JoiningContinuousTimeline<Shape> result = animate(phones, targetShapeSet);
return result; return result;
} }
@ -25,11 +26,12 @@ JoiningContinuousTimeline<Shape> animateAudioClip(
// Convenience wrapper around animateAudioClip(): creates an audio clip from filePath via
// createAudioFileClip() and passes every other argument through unchanged.
// NOTE(review): this is a two-column diff rendering; alignedPhones exists only in the right-hand
// (post-commit) column.
JoiningContinuousTimeline<Shape> animateWaveFile( JoiningContinuousTimeline<Shape> animateWaveFile(
path filePath, path filePath,
const optional<string>& dialog, const optional<string>& dialog,
const optional<BoundedTimeline<Phone>>& alignedPhones,
const Recognizer& recognizer, const Recognizer& recognizer,
const ShapeSet& targetShapeSet, const ShapeSet& targetShapeSet,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink) ProgressSink& progressSink)
{ {
const auto audioClip = createAudioFileClip(filePath); const auto audioClip = createAudioFileClip(filePath);
return animateAudioClip(*audioClip, dialog, recognizer, targetShapeSet, maxThreadCount, progressSink); return animateAudioClip(*audioClip, dialog, alignedPhones, recognizer, targetShapeSet, maxThreadCount, progressSink);
} }

View File

@ -11,6 +11,7 @@
JoiningContinuousTimeline<Shape> animateAudioClip( JoiningContinuousTimeline<Shape> animateAudioClip(
const AudioClip& audioClip, const AudioClip& audioClip,
const boost::optional<std::string>& dialog, const boost::optional<std::string>& dialog,
const boost::optional<BoundedTimeline<Phone>>& alignedPhones,
const Recognizer& recognizer, const Recognizer& recognizer,
const ShapeSet& targetShapeSet, const ShapeSet& targetShapeSet,
int maxThreadCount, int maxThreadCount,
@ -19,6 +20,7 @@ JoiningContinuousTimeline<Shape> animateAudioClip(
JoiningContinuousTimeline<Shape> animateWaveFile( JoiningContinuousTimeline<Shape> animateWaveFile(
std::filesystem::path filePath, std::filesystem::path filePath,
const boost::optional<std::string>& dialog, const boost::optional<std::string>& dialog,
const boost::optional<BoundedTimeline<Phone>>& alignedPhones,
const Recognizer& recognizer, const Recognizer& recognizer,
const ShapeSet& targetShapeSet, const ShapeSet& targetShapeSet,
int maxThreadCount, int maxThreadCount,

View File

@ -106,8 +106,12 @@ static Timeline<Phone> utteranceToPhones(
// Recognizes phones in inputAudioClip by delegating to the free function ::recognizePhones()
// with this recognizer's createDecoder and utteranceToPhones callbacks.
// Pre-aligned phones are not supported by this recognizer: if alignedPhones holds a value,
// std::invalid_argument is thrown before any audio processing takes place.
BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones( BoundedTimeline<Phone> PhoneticRecognizer::recognizePhones(
const AudioClip& inputAudioClip, const AudioClip& inputAudioClip,
optional<std::string> dialog, optional<std::string> dialog,
optional<BoundedTimeline<Phone>> alignedPhones,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink ProgressSink& progressSink
) const { ) const {
// Reject the unsupported option loudly instead of silently ignoring it
if (alignedPhones) {
throw std::invalid_argument("Phonetic recognizer doesn't support specifying aligned phones in this POC.");
}
return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink); return ::recognizePhones(inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
} }

View File

@ -8,6 +8,7 @@ public:
// Overrides the base-class recognizePhones() (declared `override`).
// dialog and alignedPhones are optional and taken by value.
// NOTE(review): alignedPhones presumably carries externally computed phone timings - confirm
// against the implementation.
BoundedTimeline<Phone> recognizePhones( BoundedTimeline<Phone> recognizePhones(
const AudioClip& inputAudioClip, const AudioClip& inputAudioClip,
boost::optional<std::string> dialog, boost::optional<std::string> dialog,
boost::optional<BoundedTimeline<Phone>> alignedPhones,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink ProgressSink& progressSink
) const override; ) const override;

View File

@ -6,8 +6,10 @@
#include "languageModels.h" #include "languageModels.h"
#include "tokenization.h" #include "tokenization.h"
#include "g2p.h" #include "g2p.h"
#include "audio/DcOffset.h"
#include "time/ContinuousTimeline.h" #include "time/ContinuousTimeline.h"
#include "audio/processing.h" #include "audio/processing.h"
#include "audio/voiceActivityDetection.h"
#include "time/timedLogging.h" #include "time/timedLogging.h"
extern "C" { extern "C" {
@ -334,9 +336,41 @@ static Timeline<Phone> utteranceToPhones(
// Recognizes phones in inputAudioClip.
//
// Without alignedPhones, this delegates to the free function ::recognizePhones() using this
// recognizer's createDecoder and utteranceToPhones callbacks.
//
// With alignedPhones, no decoder is involved. The audio is only used to detect utterances;
// for each utterance the supplied phones are copied, guessed noise sounds are overlaid as
// Phone::Noise, and the entries are merged into one timeline bounded by the range of the
// detected utterances. Failures in voice activity detection are rethrown nested in a
// runtime_error.
BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones( BoundedTimeline<Phone> PocketSphinxRecognizer::recognizePhones(
const AudioClip& inputAudioClip, const AudioClip& inputAudioClip,
optional<std::string> dialog, optional<std::string> dialog,
optional<BoundedTimeline<Phone>> alignedPhones,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink ProgressSink& progressSink
) const { ) const {
// Pre-aligned path: neither dialog nor maxThreadCount is used inside this branch
if (alignedPhones) {
// Make sure audio stream has no DC offset
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDcOffset();
// Split audio into utterances
JoiningBoundedTimeline<void> utterances;
try {
utterances = detectVoiceActivity(*audioClip, progressSink);
} catch (...) {
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
}
// The overall result spans the range covered by the detected utterances
BoundedTimeline<Phone> result(utterances.getRange());
for (auto& utterance : utterances) {
// Copy over utterance phones
// NOTE(review): presumably this constructor clips alignedPhones to the utterance's time range - confirm
BoundedTimeline<Phone> utteranceResult(utterance.getTimeRange(), *alignedPhones);
// Guess positions of noise sounds
const JoiningTimeline<void> noiseSounds = getNoiseSounds(utteranceResult.getRange(), utteranceResult);
for (const auto& noiseSound : noiseSounds) {
utteranceResult.set(noiseSound.getTimeRange(), Phone::Noise);
}
// Merge this utterance's entries into the overall result
for (const auto& timedValue : utteranceResult) {
result.set(timedValue);
}
}
return result;
}
// No pre-aligned phones: run the regular recognition pipeline
return ::recognizePhones( return ::recognizePhones(
inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink); inputAudioClip, dialog, &createDecoder, &utteranceToPhones, maxThreadCount, progressSink);
} }

View File

@ -8,6 +8,7 @@ public:
// Overrides the base-class recognizePhones() (declared `override`).
// dialog and alignedPhones are optional and taken by value.
// NOTE(review): alignedPhones presumably carries externally computed phone timings - confirm
// against the implementation.
BoundedTimeline<Phone> recognizePhones( BoundedTimeline<Phone> recognizePhones(
const AudioClip& inputAudioClip, const AudioClip& inputAudioClip,
boost::optional<std::string> dialog, boost::optional<std::string> dialog,
boost::optional<BoundedTimeline<Phone>> alignedPhones,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink ProgressSink& progressSink
) const override; ) const override;

View File

@ -12,6 +12,7 @@ public:
// Pure virtual recognition entry point; concrete recognizers must implement it.
// Returns a bounded timeline of phones for audioClip. dialog and alignedPhones are optional
// inputs taken by value.
// NOTE(review): whether an implementation honours or rejects alignedPhones is up to that
// implementation - check each override.
virtual BoundedTimeline<Phone> recognizePhones( virtual BoundedTimeline<Phone> recognizePhones(
const AudioClip& audioClip, const AudioClip& audioClip,
boost::optional<std::string> dialog, boost::optional<std::string> dialog,
boost::optional<BoundedTimeline<Phone>> alignedPhones,
int maxThreadCount, int maxThreadCount,
ProgressSink& progressSink ProgressSink& progressSink
) const = 0; ) const = 0;

View File

@ -115,6 +115,33 @@ ShapeSet getTargetShapeSet(const string& extendedShapesString) {
return result; return result;
} }
// Reads pre-calculated phone alignment data from a text file.
//
// The file is read as a sequence of records, each consisting of three whitespace-separated
// fields: start time, end time (decimal numbers in seconds, stored as centiseconds) and a
// phone, parsed through Phone's stream extraction operator.
//
// Returns a timeline bounded by the range covered by the records that were read.
//
// Throws std::invalid_argument if the file does not exist. Any failure while opening or
// parsing the file (including a malformed record) is rethrown nested in a std::runtime_error
// that names the file.
BoundedTimeline<Phone> readAlignmentFile(const path& filePath) {
	if (!exists(filePath)) {
		throw std::invalid_argument(fmt::format("File {} does not exist.", filePath.u8string()));
	}
	try {
		std::ifstream file;
		// Let the stream throw if the file cannot be opened ...
		file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
		file.open(filePath);
		// ... but switch exceptions off for parsing: running into the end of the file sets
		// failbit and is the regular way for the read loop below to finish.
		file.exceptions(std::ifstream::goodbit);

		// Round to the nearest centisecond. Plain truncation would turn e.g. 0.29 s into 28 cs,
		// because 0.29 * 100 is slightly less than 29 in binary floating point.
		const auto toCentiseconds = [](double seconds) {
			const double scaled = seconds * 100;
			return centiseconds(static_cast<int>(scaled < 0 ? scaled - 0.5 : scaled + 0.5));
		};

		Timeline<Phone> result;
		double start = 0.0;
		double end = 0.0;
		Phone phone{};
		// Only store a record once all three fields were extracted successfully, so that the
		// failed read at the end of the file never adds a bogus entry.
		while (file >> start >> end >> phone) {
			result.set(toCentiseconds(start), toCentiseconds(end), phone);
		}
		// The loop must have stopped because the input was exhausted. Anything else means a
		// record could not be parsed (or an I/O error occurred).
		if (!file.eof()) {
			throw std::runtime_error("Malformed alignment record.");
		}
		return BoundedTimeline<Phone>(result.getRange(), result);
	} catch (...) {
		std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath.u8string())));
	}
}
int main(int platformArgc, char* platformArgv[]) { int main(int platformArgc, char* platformArgv[]) {
// Set up default logging so early errors are printed to stdout // Set up default logging so early errors are printed to stdout
const logging::Level defaultMinStderrLevel = logging::Level::Error; const logging::Level defaultMinStderrLevel = logging::Level::Error;
@ -174,6 +201,11 @@ int main(int platformArgc, char* platformArgv[]) {
false, "GHX", "string", cmd false, "GHX", "string", cmd
); );
tclap::ValueArg<string> alignmentFile(
"", "alignmentFile", "A .tsv file containing pre-calculated phoneme alignment data.",
false, string(), "string", cmd
);
tclap::ValueArg<string> dialogFile( tclap::ValueArg<string> dialogFile(
"d", "dialogFile", "A file containing the text of the dialog.", "d", "dialogFile", "A file containing the text of the dialog.",
false, string(), "string", cmd false, string(), "string", cmd
@ -263,6 +295,9 @@ int main(int platformArgc, char* platformArgv[]) {
dialogFile.isSet() dialogFile.isSet()
? readUtf8File(u8path(dialogFile.getValue())) ? readUtf8File(u8path(dialogFile.getValue()))
: boost::optional<string>(), : boost::optional<string>(),
alignmentFile.isSet()
? readAlignmentFile(u8path(alignmentFile.getValue()))
: boost::optional<BoundedTimeline<Phone>>(),
*createRecognizer(recognizerType.getValue()), *createRecognizer(recognizerType.getValue()),
targetShapeSet, targetShapeSet,
maxThreadCount.getValue(), maxThreadCount.getValue(),