Restored dialog option, this time based on a language model

This approach should be more robust and error-tolerant.
This commit is contained in:
Daniel Wolf 2016-06-03 21:07:49 +02:00
parent 4ed5908627
commit 0d488e8de2
11 changed files with 357 additions and 68 deletions

View File

@ -64,7 +64,7 @@ target_compile_options(cppFormat PRIVATE ${disableWarningsFlags})
set_target_properties(cppFormat PROPERTIES FOLDER lib)
# ... sphinxbase
include_directories(SYSTEM "lib/sphinxbase-5prealpha-2015-08-05/include")
include_directories(SYSTEM "lib/sphinxbase-5prealpha-2015-08-05/include" "lib/sphinxbase-5prealpha-2015-08-05/src")
FILE(GLOB_RECURSE sphinxbaseFiles "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")
add_library(sphinxbase ${sphinxbaseFiles})
target_compile_options(sphinxbase PRIVATE ${disableWarningsFlags})
@ -192,6 +192,8 @@ set(SOURCE_FILES
src/Exporter.cpp src/Exporter.h
src/tokenization.cpp src/tokenization.h
src/g2p.cpp src/g2p.h
src/languageModels.cpp src/languageModels.h
src/tupleHash.h
)
add_executable(rhubarb ${SOURCE_FILES})
target_link_libraries(rhubarb ${Boost_LIBRARIES} cppFormat sphinxbase pocketSphinx flite)

183
src/languageModels.cpp Normal file
View File

@ -0,0 +1,183 @@
#include "languageModels.h"
#include <boost/range/adaptor/map.hpp>
#include <vector>
#include <regex>
#include <map>
#include <tuple>
#include "platformTools.h"
#include <boost/filesystem/fstream.hpp>
#include "appInfo.h"
#include <cmath>
#include <gsl_util.h>
using std::string;
using std::u32string;
using std::vector;
using std::regex;
using std::map;
using std::tuple;
using std::make_tuple;
using std::get;
using std::endl;
using boost::filesystem::path;
using unigram_t = string;
using bigram_t = tuple<string, string>;
using trigram_t = tuple<string, string, string>;
// Tallies how many times each individual word appears in the sequence.
std::map<std::string, int> getUnigramCounts(const std::vector<std::string>& words) {
    std::map<std::string, int> counts;
    for (const auto& word : words) {
        counts[word]++;
    }
    return counts;
}
// Tallies each ordered pair of adjacent words in the sequence.
// Guards against inputs with fewer than two words: the original unguarded
// `it < words.end() - 1` loop invoked undefined behavior on an empty vector
// (sibling getTrigramCounts already carries the analogous size check).
std::map<std::tuple<std::string, std::string>, int> getBigramCounts(const std::vector<std::string>& words) {
    std::map<std::tuple<std::string, std::string>, int> bigramCounts;
    if (words.size() >= 2) {
        for (auto it = words.begin(); it < words.end() - 1; ++it) {
            ++bigramCounts[std::make_tuple(*it, *(it + 1))];
        }
    }
    return bigramCounts;
}
// Tallies each ordered triple of adjacent words; empty for inputs shorter than 3.
std::map<std::tuple<std::string, std::string, std::string>, int> getTrigramCounts(const std::vector<std::string>& words) {
    std::map<std::tuple<std::string, std::string, std::string>, int> counts;
    if (words.size() >= 3) {
        const size_t tripleCount = words.size() - 2;
        for (size_t i = 0; i < tripleCount; ++i) {
            ++counts[std::make_tuple(words[i], words[i + 1], words[i + 2])];
        }
    }
    return counts;
}
// Computes deflated maximum-likelihood unigram probabilities:
// P(w) = count(w) / totalWords * deflator.
std::map<std::string, double> getUnigramProbabilities(const std::vector<std::string>& words, const std::map<std::string, int>& unigramCounts, const double deflator) {
    std::map<std::string, double> probabilities;
    const double totalWords = words.size();
    for (const auto& entry : unigramCounts) {
        probabilities[entry.first] = entry.second / totalWords * deflator;
    }
    return probabilities;
}
// Computes deflated conditional bigram probabilities:
// P(w2 | w1) = count(w1, w2) / count(w1) * deflator.
std::map<std::tuple<std::string, std::string>, double> getBigramProbabilities(const std::map<std::string, int>& unigramCounts, const std::map<std::tuple<std::string, std::string>, int>& bigramCounts, const double deflator) {
    std::map<std::tuple<std::string, std::string>, double> probabilities;
    for (const auto& entry : bigramCounts) {
        const auto& bigram = entry.first;
        const int prefixCount = unigramCounts.at(std::get<0>(bigram));
        probabilities[bigram] = double(entry.second) / prefixCount * deflator;
    }
    return probabilities;
}
// Computes deflated conditional trigram probabilities:
// P(w3 | w1, w2) = count(w1, w2, w3) / count(w1, w2) * deflator.
std::map<std::tuple<std::string, std::string, std::string>, double> getTrigramProbabilities(const std::map<std::tuple<std::string, std::string>, int>& bigramCounts, const std::map<std::tuple<std::string, std::string, std::string>, int>& trigramCounts, const double deflator) {
    std::map<std::tuple<std::string, std::string, std::string>, double> probabilities;
    for (const auto& entry : trigramCounts) {
        const auto& trigram = entry.first;
        const auto prefix = std::make_tuple(std::get<0>(trigram), std::get<1>(trigram));
        const int prefixCount = bigramCounts.at(prefix);
        probabilities[trigram] = double(entry.second) / prefixCount * deflator;
    }
    return probabilities;
}
// Computes back-off weights for unigrams:
// alpha(w1) = discountMass / (1 - sum of P(w2) over all bigrams (w1, w2)).
// Precomputes the per-prefix probability sums in one pass over the bigrams,
// replacing the original O(|unigrams| * |bigrams|) nested scan with O(|unigrams| + |bigrams|).
std::map<std::string, double> getUnigramBackoffWeights(
    const std::map<std::string, int>& unigramCounts,
    const std::map<std::string, double>& unigramProbabilities,
    const std::map<std::tuple<std::string, std::string>, int>& bigramCounts,
    const double discountMass)
{
    // For each bigram prefix word, sum the unigram probabilities of its successors.
    std::map<std::string, double> successorProbabilitySums;
    for (const auto& entry : bigramCounts) {
        const auto& bigram = entry.first;
        successorProbabilitySums[std::get<0>(bigram)] += unigramProbabilities.at(std::get<1>(bigram));
    }

    std::map<std::string, double> backoffWeights;
    for (const auto& entry : unigramCounts) {
        const std::string& unigram = entry.first;
        double denominator = 1;
        const auto sumIt = successorProbabilitySums.find(unigram);
        if (sumIt != successorProbabilitySums.end()) denominator -= sumIt->second;
        backoffWeights[unigram] = discountMass / denominator;
    }
    return backoffWeights;
}
// Computes back-off weights for bigrams:
// alpha(w1, w2) = discountMass / (1 - sum of P(w3 | w2) over all trigrams (w1, w2, w3)).
// Precomputes the per-prefix probability sums in one pass over the trigrams,
// replacing the original O(|bigrams| * |trigrams|) nested scan with O(|bigrams| + |trigrams|).
std::map<std::tuple<std::string, std::string>, double> getBigramBackoffWeights(
    const std::map<std::tuple<std::string, std::string>, int>& bigramCounts,
    const std::map<std::tuple<std::string, std::string>, double>& bigramProbabilities,
    const std::map<std::tuple<std::string, std::string, std::string>, int>& trigramCounts,
    const double discountMass)
{
    // For each trigram prefix (w1, w2), sum the probabilities of its successor bigrams (w2, w3).
    std::map<std::tuple<std::string, std::string>, double> successorProbabilitySums;
    for (const auto& entry : trigramCounts) {
        const auto& trigram = entry.first;
        const auto prefix = std::make_tuple(std::get<0>(trigram), std::get<1>(trigram));
        const auto successor = std::make_tuple(std::get<1>(trigram), std::get<2>(trigram));
        successorProbabilitySums[prefix] += bigramProbabilities.at(successor);
    }

    std::map<std::tuple<std::string, std::string>, double> backoffWeights;
    for (const auto& entry : bigramCounts) {
        const auto& bigram = entry.first;
        double denominator = 1;
        const auto sumIt = successorProbabilitySums.find(bigram);
        if (sumIt != successorProbabilitySums.end()) denominator -= sumIt->second;
        backoffWeights[bigram] = discountMass / denominator;
    }
    return backoffWeights;
}
// Writes an ARPA-format trigram language model file for the given word sequence.
// A fixed discount mass (0.5) is held back from the maximum-likelihood
// probabilities and redistributed via back-off weights.
// NOTE(review): getBigramCounts iterates unguarded — confirm callers always
// pass at least two words (the one visible caller adds <s> and </s> markers).
void createLanguageModelFile(const vector<string>& words, path filePath) {
    // Hold back half the probability mass for back-off smoothing
    const double discountMass = 0.5;
    const double deflator = 1.0 - discountMass;
    // Raw n-gram counts
    map<unigram_t, int> unigramCounts = getUnigramCounts(words);
    map<bigram_t, int> bigramCounts = getBigramCounts(words);
    map<trigram_t, int> trigramCounts = getTrigramCounts(words);
    // Discounted (conditional) probabilities per n-gram order
    map<unigram_t, double> unigramProbabilities = getUnigramProbabilities(words, unigramCounts, deflator);
    map<bigram_t, double> bigramProbabilities = getBigramProbabilities(unigramCounts, bigramCounts, deflator);
    map<trigram_t, double> trigramProbabilities = getTrigramProbabilities(bigramCounts, trigramCounts, deflator);
    // Back-off weights redistributing the discounted mass to unseen continuations
    map<unigram_t, double> unigramBackoffWeights = getUnigramBackoffWeights(unigramCounts, unigramProbabilities, bigramCounts, discountMass);
    map<bigram_t, double> bigramBackoffWeights = getBigramBackoffWeights(bigramCounts, bigramProbabilities, trigramCounts, discountMass);
    boost::filesystem::ofstream file(filePath);
    // Header text before the \data\ marker
    file << "Generated by " << appName << " " << appVersion << endl << endl;
    file << "\\data\\" << endl;
    file << "ngram 1=" << unigramCounts.size() << endl;
    file << "ngram 2=" << bigramCounts.size() << endl;
    file << "ngram 3=" << trigramCounts.size() << endl << endl;
    // Probabilities and back-off weights are written as fixed-precision log10 values
    file.setf(std::ios::fixed, std::ios::floatfield);
    file.precision(4);
    file << "\\1-grams:" << endl;
    for (const unigram_t& unigram : unigramCounts | boost::adaptors::map_keys) {
        file << log10(unigramProbabilities.at(unigram))
            << " " << unigram
            << " " << log10(unigramBackoffWeights.at(unigram)) << endl;
    }
    file << endl;
    file << "\\2-grams:" << endl;
    for (const bigram_t& bigram : bigramCounts | boost::adaptors::map_keys) {
        file << log10(bigramProbabilities.at(bigram))
            << " " << get<0>(bigram) << " " << get<1>(bigram)
            << " " << log10(bigramBackoffWeights.at(bigram)) << endl;
    }
    file << endl;
    // Trigrams are the highest order written, so no back-off weight column
    file << "\\3-grams:" << endl;
    for (const trigram_t& trigram : trigramCounts | boost::adaptors::map_keys) {
        file << log10(trigramProbabilities.at(trigram))
            << " " << get<0>(trigram) << " " << get<1>(trigram) << " " << get<2>(trigram) << endl;
    }
    file << endl;
    file << "\\end\\" << endl;
}
// Builds a PocketSphinx n-gram language model from the given word sequence.
// Writes a temporary ARPA file, loads it with ngram_model_read, and removes
// the temp file again. Note: ngram_model_read is evaluated inside the return
// expression, i.e. before the gsl::finally cleanup deletes the file.
lambda_unique_ptr<ngram_model_t> createLanguageModel(const vector<string>& words, logmath_t& logMath) {
    path tempFilePath = getTempFilePath();
    createLanguageModelFile(words, tempFilePath);
    // Delete the temp file when this function exits, even on exception
    auto deleteTempFile = gsl::finally([&]() { boost::filesystem::remove(tempFilePath); });
    return lambda_unique_ptr<ngram_model_t>(
        ngram_model_read(nullptr, tempFilePath.string().c_str(), NGRAM_ARPA, &logMath),
        [](ngram_model_t* lm) { ngram_model_free(lm); });
}

6
src/languageModels.h Normal file
View File

@ -0,0 +1,6 @@
#pragma once

#include <sphinxbase/ngram_model.h>
#include <vector>
#include <string>
#include "tools.h"

// Builds a PocketSphinx n-gram language model from the given word sequence.
// Declared here with std::string, so <string> is now included explicitly
// rather than relying on transitive includes.
lambda_unique_ptr<ngram_model_t> createLanguageModel(const std::vector<std::string>& words, logmath_t& logMath);

View File

@ -12,9 +12,12 @@
#include <gsl_util.h>
#include "Exporter.h"
#include "ContinuousTimeline.h"
#include <boost/filesystem/operations.hpp>
#include "stringTools.h"
using std::exception;
using std::string;
using std::u32string;
using std::vector;
using std::unique_ptr;
using std::make_unique;
@ -75,6 +78,25 @@ void addFileSink(path path, logging::Level minLevel) {
logging::addSink(levelFilter);
}
// Reads a text file and returns its content decoded to UTF-32.
// Throws std::invalid_argument if the file does not exist, and a nested
// std::runtime_error (including the file path) on read or decoding errors.
u32string readTextFile(path filePath) {
    if (!exists(filePath)) {
        throw std::invalid_argument(fmt::format("File {} does not exist.", filePath));
    }
    try {
        boost::filesystem::ifstream file;
        // Fail loudly on I/O errors instead of silently returning partial content
        file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
        file.open(filePath);
        string utf8Text((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
        try {
            return utf8ToUtf32(utf8Text);
        } catch (...) {
            // Fixed: the original passed filePath to a format string without a
            // placeholder; the outer handler below already reports the path.
            std::throw_with_nested(std::runtime_error("File encoding is not ASCII or UTF-8."));
        }
    } catch (...) {
        std::throw_with_nested(std::runtime_error(fmt::format("Error reading file {0}.", filePath)));
    }
}
int main(int argc, char *argv[]) {
auto pausableStderrSink = addPausableStdErrSink(logging::Level::Warn);
pausableStderrSink->pause();
@ -88,6 +110,7 @@ int main(int argc, char *argv[]) {
tclap::ValuesConstraint<logging::Level> logLevelConstraint(logLevels);
tclap::ValueArg<logging::Level> logLevel("", "logLevel", "The minimum log level to log", false, logging::Level::Debug, &logLevelConstraint, cmd);
tclap::ValueArg<string> logFileName("", "logFile", "The log file path.", false, string(), "string", cmd);
tclap::ValueArg<string> dialogFile("d", "dialogFile", "A file containing the text of the dialog.", false, string(), "string", cmd);
auto exportFormats = vector<ExportFormat>(ExportFormatConverter::get().getValues());
tclap::ValuesConstraint<ExportFormat> exportFormatConstraint(exportFormats);
tclap::ValueArg<ExportFormat> exportFormat("f", "exportFormat", "The export format.", false, ExportFormat::TSV, &exportFormatConstraint, cmd);
@ -117,6 +140,7 @@ int main(int argc, char *argv[]) {
ProgressBar progressBar;
phones = detectPhones(
createAudioStream(inputFileName.getValue()),
dialogFile.isSet() ? readTextFile(path(dialogFile.getValue())) : boost::optional<u32string>(),
progressBar);
}
std::cerr << "Done" << std::endl;

View File

@ -1,6 +1,5 @@
#include <iostream>
#include <boost/filesystem.hpp>
#include <boost/algorithm/string.hpp>
#include "phoneExtraction.h"
#include "audio/SampleRateConverter.h"
#include "platformTools.h"
@ -14,6 +13,9 @@
#include <Timeline.h>
#include <audio/voiceActivityDetection.h>
#include <audio/AudioStreamSegment.h>
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
extern "C" {
#include <pocketsphinx.h>
@ -35,33 +37,34 @@ using std::function;
using std::regex;
using std::regex_replace;
using std::chrono::duration;
using boost::optional;
using std::u32string;
constexpr int sphinxSampleRate = 16000;
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
const path& getSphinxModelDirectory() {
static path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
return sphinxModelDirectory;
}
lambda_unique_ptr<ps_decoder_t> createDecoder() {
lambda_unique_ptr<cmd_ln_t> config(
cmd_ln_init(
nullptr, ps_args(), true,
// Set acoustic model
"-hmm", (sphinxModelDirectory / "acoustic-model").string().c_str(),
// Set language model
"-lm", (sphinxModelDirectory / "en-us.lm.bin").string().c_str(),
// Set pronounciation dictionary
"-dict", (sphinxModelDirectory / "cmudict-en-us.dict").string().c_str(),
"-hmm", (getSphinxModelDirectory() / "acoustic-model").string().c_str(),
// Set pronunciation dictionary
"-dict", (getSphinxModelDirectory() / "cmudict-en-us.dict").string().c_str(),
// Add noise against zero silence (see http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor)
"-dither", "yes",
nullptr),
[](cmd_ln_t* config) { cmd_ln_free_r(config); });
if (!config) throw runtime_error("Error creating configuration.");
return config;
}
lambda_unique_ptr<ps_decoder_t> createSpeechRecognizer(cmd_ln_t& config) {
lambda_unique_ptr<ps_decoder_t> recognizer(
ps_init(&config),
ps_init(config.get()),
[](ps_decoder_t* recognizer) { ps_free(recognizer); });
if (!recognizer) throw runtime_error("Error creating speech recognizer.");
if (!recognizer) throw runtime_error("Error creating speech decoder.");
return recognizer;
}
@ -141,32 +144,32 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
logging::log(logLevel, message);
}
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, ProgressSink& progressSink) {
// Convert audio stream to the exact format PocketSphinx requires
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
// Restart timing at 0
ps_start_stream(&recognizer);
ps_start_stream(&decoder);
// Start recognition
int error = ps_start_utt(&recognizer);
int error = ps_start_utt(&decoder);
if (error) throw runtime_error("Error starting utterance processing for word recognition.");
// Process entire sound file
auto processBuffer = [&recognizer](const vector<int16_t>& buffer) {
int searchedFrameCount = ps_process_raw(&recognizer, buffer.data(), buffer.size(), false, false);
auto processBuffer = [&decoder](const vector<int16_t>& buffer) {
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
};
processAudioStream(*audioStream.get(), processBuffer, progressSink);
// End recognition
error = ps_end_utt(&recognizer);
error = ps_end_utt(&decoder);
if (error) throw runtime_error("Error ending utterance processing for word recognition.");
// Collect words
BoundedTimeline<string> result(audioStream->getTruncatedRange());
int32_t score;
for (ps_seg_t* it = ps_seg_iter(&recognizer, &score); it; it = ps_seg_next(it)) {
for (ps_seg_t* it = ps_seg_iter(&decoder, &score); it; it = ps_seg_next(it)) {
const char* word = ps_seg_word(it);
int firstFrame, lastFrame;
ps_seg_frames(it, &firstFrame, &lastFrame);
@ -176,35 +179,6 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
return result;
}
// Splits dialog into words, doing minimal preprocessing.
// A robust solution should use TTS logic to cope with numbers, abbreviations, unknown words etc.
// Returns the lower-cased words with "<sil>" tokens inserted at phrase boundaries.
// NOTE(review): in the character class below, "\\0" is an escaped NUL, so
// "\\0-9" is the range NUL..'9' rather than the likely-intended "0-9" — confirm.
vector<string> extractDialogWords(string dialog) {
    // Convert to lower case
    boost::algorithm::to_lower(dialog);
    // Insert silences where appropriate
    dialog = regex_replace(dialog, regex("[,;.:!?] |-"), " <sil> ");
    // Remove all undesired characters
    dialog = regex_replace(dialog, regex("[^a-z.'\\0-9<>]"), " ");
    // Collapse whitespace
    dialog = regex_replace(dialog, regex("\\s+"), " ");
    // Trim
    boost::algorithm::trim(dialog);
    // Ugly hack: Remove trailing period
    if (boost::algorithm::ends_with(dialog, ".")) {
        dialog.pop_back();
    }
    // Split into words
    vector<string> result;
    boost::algorithm::split(result, dialog, boost::is_space());
    return result;
}
s3wid_t getWordId(const string& word, dict_t& dictionary) {
s3wid_t wordId = dict_wordid(&dictionary, word.c_str());
if (wordId == BAD_S3WID) throw invalid_argument(fmt::format("Unknown word '{}'.", word));
@ -214,12 +188,12 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
BoundedTimeline<Phone> getPhoneAlignment(
const vector<s3wid_t>& wordIds,
unique_ptr<AudioStream> audioStream,
ps_decoder_t& recognizer,
ps_decoder_t& decoder,
ProgressSink& progressSink)
{
// Create alignment list
lambda_unique_ptr<ps_alignment_t> alignment(
ps_alignment_init(recognizer.d2p),
ps_alignment_init(decoder.d2p),
[](ps_alignment_t* alignment) { ps_alignment_free(alignment); });
if (!alignment) throw runtime_error("Error creating alignment.");
for (s3wid_t wordId : wordIds) {
@ -233,9 +207,9 @@ BoundedTimeline<Phone> getPhoneAlignment(
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
// Create search structure
acmod_t* acousticModel = recognizer.acmod;
acmod_t* acousticModel = decoder.acmod;
lambda_unique_ptr<ps_search_t> search(
state_align_search_init("state_align", recognizer.config, acousticModel, alignment.get()),
state_align_search_init("state_align", decoder.config, acousticModel, alignment.get()),
[](ps_search_t* search) { ps_search_free(search); });
if (!search) throw runtime_error("Error creating search.");
@ -247,7 +221,7 @@ BoundedTimeline<Phone> getPhoneAlignment(
ps_search_start(search.get());
// Process entire sound file
auto processBuffer = [&recognizer, &acousticModel, &search](const vector<int16_t>& buffer) {
auto processBuffer = [&decoder, &acousticModel, &search](const vector<int16_t>& buffer) {
const int16* nextSample = buffer.data();
size_t remainingSamples = buffer.size();
while (acmod_process_raw(acousticModel, &nextSample, &remainingSamples, false) > 0) {
@ -266,7 +240,7 @@ BoundedTimeline<Phone> getPhoneAlignment(
acmod_end_utt(acousticModel);
// Extract phones with timestamps
char** phoneNames = recognizer.dict->mdef->ciname;
char** phoneNames = decoder.dict->mdef->ciname;
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
for (ps_alignment_iter_t* it = ps_alignment_phones(alignment.get()); it; it = ps_alignment_iter_next(it)) {
// Get phone
@ -285,8 +259,28 @@ BoundedTimeline<Phone> getPhoneAlignment(
return result;
}
void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
map<string, string> missingPronunciations;
for (const string& word : words) {
if (dict_wordid(decoder.dict, word.c_str()) == BAD_S3WID) {
string pronunciation;
for (Phone phone : wordToPhones(word)) {
if (pronunciation.length() > 0) pronunciation += " ";
pronunciation += PhoneConverter::get().toString(phone);
}
missingPronunciations[word] = pronunciation;
}
}
for (auto it = missingPronunciations.begin(); it != missingPronunciations.end(); ++it) {
bool isLast = it == --missingPronunciations.end();
logging::infoFormat("Unknown word '{}'. Guessing pronunciation '{}'.", it->first, it->second);
ps_add_word(&decoder, it->first.c_str(), it->second.c_str(), isLast);
}
}
BoundedTimeline<Phone> detectPhones(
unique_ptr<AudioStream> audioStream,
optional<u32string> dialog,
ProgressSink& progressSink)
{
// Pocketsphinx doesn't like empty input
@ -305,13 +299,6 @@ BoundedTimeline<Phone> detectPhones(
audioStream = removeDCOffset(std::move(audioStream));
try {
// Create PocketSphinx configuration
path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
auto config = createConfig(sphinxModelDirectory);
// Create speech recognizer
auto recognizer = createSpeechRecognizer(*config.get());
// Split audio into utterances
BoundedTimeline<void> utterances = detectVoiceActivity(audioStream->clone(true));
@ -323,6 +310,29 @@ BoundedTimeline<Phone> detectPhones(
}
auto utteranceProgressSinkIt = utteranceProgressSinks.begin();
// Create speech recognizer
auto decoder = createDecoder();
// Set language model
lambda_unique_ptr<ngram_model_t> languageModel;
if (dialog) {
// Create dialog-specific language model
vector<string> words = tokenizeText(*dialog);
words.insert(words.begin(), "<s>");
words.push_back("</s>");
languageModel = createLanguageModel(words, *decoder->lmath);
// Add any dialog-specific words to the dictionary
addMissingDictionaryWords(words, *decoder);
} else {
path modelPath = getSphinxModelDirectory() / "en-us.lm.bin";
languageModel = lambda_unique_ptr<ngram_model_t>(
ngram_model_read(decoder->config, modelPath.string().c_str(), NGRAM_AUTO, decoder->lmath),
[](ngram_model_t* lm) { ngram_model_free(lm); });
}
ps_set_lm(decoder.get(), "lm", languageModel.get());
ps_set_search(decoder.get(), "lm");
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
for (const auto& timedUtterance : utterances) {
ProgressMerger utteranceProgressMerger(**utteranceProgressSinkIt++);
@ -335,7 +345,7 @@ BoundedTimeline<Phone> detectPhones(
auto streamSegment = createSegment(audioStream->clone(true), timeRange);
// Get words
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), *recognizer.get(), wordRecognitionProgressSink);
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), *decoder.get(), wordRecognitionProgressSink);
for (Timed<string> timedWord : words) {
timedWord.getTimeRange().shift(timedUtterance.getStart());
logging::logTimedEvent("word", timedWord);
@ -344,12 +354,12 @@ BoundedTimeline<Phone> detectPhones(
// Look up words in dictionary
vector<s3wid_t> wordIds;
for (const auto& timedWord : words) {
wordIds.push_back(getWordId(timedWord.getValue(), *recognizer->dict));
wordIds.push_back(getWordId(timedWord.getValue(), *decoder->dict));
}
if (wordIds.empty()) continue;
// Align the words' phones with speech
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *recognizer.get(), alignmentProgressSink);
BoundedTimeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), *decoder.get(), alignmentProgressSink);
segmentPhones.shift(timedUtterance.getStart());
for (const auto& timedPhone : segmentPhones) {
logging::logTimedEvent("phone", timedPhone);

View File

@ -8,4 +8,5 @@
BoundedTimeline<Phone> detectPhones(
std::unique_ptr<AudioStream> audioStream,
boost::optional<std::u32string> dialog,
ProgressSink& progressSink);

View File

@ -2,10 +2,14 @@
#include <boost/filesystem/path.hpp>
#include <boost/predef.h>
#include <format.h>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
#include "platformTools.h"
using boost::filesystem::path;
using std::string;
constexpr int InitialBufferSize = 256;
@ -129,3 +133,10 @@ path getBinPath() {
path getBinDirectory() {
return getBinPath().parent_path();
}
// Returns a path in the system temp directory with a random UUID as file name.
// The file itself is not created; collisions are improbable but not impossible.
path getTempFilePath() {
    path tempDirectory = boost::filesystem::temp_directory_path();
    // One generator instance reused across calls
    // NOTE(review): boost::uuids::random_generator is not documented as
    // thread-safe — confirm this is only called from one thread.
    static auto generateUuid = boost::uuids::random_generator();
    string fileName = to_string(generateUuid());
    return tempDirectory / fileName;
}

View File

@ -4,3 +4,4 @@
boost::filesystem::path getBinPath();
boost::filesystem::path getBinDirectory();
boost::filesystem::path getTempFilePath();

View File

@ -1,5 +1,6 @@
#include "stringTools.h"
#include <boost/algorithm/string/trim.hpp>
#include <codecvt>
using std::string;
using std::wstring;
@ -106,3 +107,12 @@ string toASCII(const u32string& s) {
}
return result;
}
u32string utf8ToUtf32(const string& s) {
// Visual Studio 2015 has a bug regarding char32_t:
// https://connect.microsoft.com/VisualStudio/feedback/details/1403302/unresolved-external-when-using-codecvt-utf8
// Once VS2016 is out, we can use char32_t instead of uint32_t as type arguments and get rid of the outer conversion.
std::wstring_convert<std::codecvt_utf8<uint32_t>, uint32_t> convert;
return u32string(reinterpret_cast<const char32_t*>(convert.from_bytes(s).c_str()));
}

View File

@ -1,6 +1,5 @@
#pragma once
#include <string>
#include <vector>
#include <boost/optional.hpp>
@ -15,3 +14,5 @@ std::wstring latin1ToWide(const std::string& s);
boost::optional<char> toASCII(char32_t ch);
std::string toASCII(const std::u32string& s);
std::u32string utf8ToUtf32(const std::string& s);

40
src/tupleHash.h Normal file
View File

@ -0,0 +1,40 @@
#pragma once

#include <tuple>
#include <cstddef>
#include <functional>

// Hash support so std::tuple can be used as a key in unordered containers.
// The helpers live in a named detail namespace: the original anonymous
// namespace in a header gives every translation unit its own copies
// (internal linkage), which is a known ODR/bloat hazard.
// NOTE(review): specializing std::hash for tuples of standard types is not
// sanctioned by the standard; it works on common compilers but consider a
// dedicated hasher type if portability problems ever surface.
namespace tuple_hash_detail {

    // Folds a value's hash into a running seed (same scheme as boost::hash_combine).
    template <typename T>
    void hash_combine(std::size_t& seed, const T& value) {
        seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }

    // Recursive template code derived from Matthieu M.
    // Recursively folds each tuple element's hash into the seed, front to back.
    template <typename Tuple, std::size_t Index = std::tuple_size<Tuple>::value - 1>
    struct HashValueImpl {
        static void apply(std::size_t& seed, const Tuple& tuple) {
            HashValueImpl<Tuple, Index - 1>::apply(seed, tuple);
            hash_combine(seed, std::get<Index>(tuple));
        }
    };

    // Base case: first element
    template <typename Tuple>
    struct HashValueImpl<Tuple, 0> {
        static void apply(std::size_t& seed, const Tuple& tuple) {
            hash_combine(seed, std::get<0>(tuple));
        }
    };

}

namespace std {
    template <typename ... TT>
    struct hash<tuple<TT...>> {
        size_t operator()(const tuple<TT...>& tt) const {
            size_t seed = 0;
            tuple_hash_detail::HashValueImpl<tuple<TT...>>::apply(seed, tt);
            return seed;
        }
    };
}