From c9b17e19379acecd56ece7f8f9fd862e0fd6a42c Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Sat, 25 Jun 2016 21:52:04 +0200
Subject: [PATCH] Improved tokenization by taking dictionary into account

---
 src/phoneExtraction.cpp     |  8 ++++++--
 src/tokenization.cpp        | 38 ++++++++++++++++++++++++++++++++++---
 src/tokenization.h          |  4 ++--
 tests/tokenizationTests.cpp | 30 ++++++++++++++++-------------
 4 files changed, 60 insertions(+), 20 deletions(-)
diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp
index 3252792..3c98875 100644
--- a/src/phoneExtraction.cpp
+++ b/src/phoneExtraction.cpp
@@ -231,10 +231,14 @@ optional<BoundedTimeline<Phone>> getPhoneAlignment(
 	return result;
 }
 
+bool dictionaryContains(dict_t& dictionary, const string& word) { // True unless the lookup below yields BAD_S3WID for word
+	return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID;
+}
+
 void addMissingDictionaryWords(const vector<string>& words, ps_decoder_t& decoder) {
 	map<string, string> missingPronunciations;
 	for (const string& word : words) {
-		if (dict_wordid(decoder.dict, word.c_str()) == BAD_S3WID) {
+		if (!dictionaryContains(*decoder.dict, word)) {
 			string pronunciation;
 			for (Phone phone : wordToPhones(word)) {
 				if (pronunciation.length() > 0) pronunciation += " ";
@@ -287,7 +291,7 @@ BoundedTimeline<Phone> detectPhones(
 		lambda_unique_ptr<ngram_model_t> languageModel;
 		if (dialog) {
 			// Create dialog-specific language model
-			vector<string> words = tokenizeText(*dialog);
+			vector<string> words = tokenizeText(*dialog, [&](const string& word) { return dictionaryContains(*decoder->dict, word); });
 			words.insert(words.begin(), "<s>");
 			words.push_back("</s>");
 			languageModel = createLanguageModel(words, *decoder->lmath);
diff --git a/src/tokenization.cpp b/src/tokenization.cpp
index bf57ebe..239ec45 100644
--- a/src/tokenization.cpp
+++ b/src/tokenization.cpp
@@ -15,6 +15,8 @@ using std::string;
 using std::vector;
 using std::regex;
 using std::pair;
+using boost::optional;
+using std::function;
 
 lambda_unique_ptr<cst_voice> createDummyVoice() {
 	lambda_unique_ptr<cst_voice> voice(new_voice(), [](cst_voice* voice) { delete_voice(voice); });
@@ -51,7 +53,27 @@ vector<string> tokenizeViaFlite(const string& text) {
 	return result;
 }
 
-vector<string> tokenizeText(const u32string& text) {
+// Returns the first variant of word that dictionaryContains accepts, or boost::none if there is none.
+// Variants are the word itself or the word with one apostrophe inserted at any position; all of
+// these are tried without a trailing period first, then again with "." appended.
+optional<string> findSimilarDictionaryWord(const string& word, const function<bool(const string&)>& dictionaryContains) {
+	for (bool addPeriod : { false, true }) {
+		// Index -1 stands for "no apostrophe"; index word.size() puts the apostrophe at the very end
+		for (int apostropheIndex = -1; apostropheIndex <= static_cast<int>(word.size()); ++apostropheIndex) {
+			string modified = word;
+			if (apostropheIndex != -1) modified.insert(apostropheIndex, "'");
+			if (addPeriod) modified += ".";
+
+			if (dictionaryContains(modified)) {
+				return modified;
+			}
+		}
+	}
+
+	return boost::none;
+}
+
+vector<string> tokenizeText(const u32string& text, function<bool(const string&)> dictionaryContains) {
 	vector<string> words = tokenizeViaFlite(toASCII(text));
 
 	// Join words separated by apostophes
@@ -63,7 +85,7 @@ vector<string> tokenizeText(const u32string& text) {
 	}
 
 	// Turn some symbols into words, remove the rest
-	vector<pair<regex, string>> replacements {
+	const static vector<pair<regex, string>> replacements {
 		{ regex("&"), "and" },
 		{ regex("\\*"), "times" },
 		{ regex("\\+"), "plus" },
@@ -73,12 +95,22 @@ vector<string> tokenizeText(const u32string& text) {
 	};
 	for (size_t i = 0; i < words.size(); ++i) {
 		for (const auto& replacement : replacements) {
-			words[i] = std::regex_replace(words[i], replacement.first, replacement.second);
+			words[i] = regex_replace(words[i], replacement.first, replacement.second);
 		}
 	}
 
 	// Remove empty words
 	words.erase(std::remove_if(words.begin(), words.end(), [](const string& s) { return s.empty(); }), words.end());
 
+	// Try to replace words that are not in the dictionary with similar ones that are
+	for (size_t i = 0; i < words.size(); ++i) {
+		if (!dictionaryContains(words[i])) {
+			optional<string> modifiedWord = findSimilarDictionaryWord(words[i], dictionaryContains);
+			if (modifiedWord) {
+				words[i] = *modifiedWord;
+			}
+		}
+	}
+
 	return words;
 }
diff --git a/src/tokenization.h b/src/tokenization.h
index 36fe276..7b34961 100644
--- a/src/tokenization.h
+++ b/src/tokenization.h
@@ -1,6 +1,10 @@
 #pragma once
 
 #include <vector>
 #include <string>
+#include <functional>
 
-std::vector<std::string> tokenizeText(const std::u32string& text);
\ No newline at end of file
+// Tokenizes text into words.
+// dictionaryContains tells whether a word is in the dictionary; a word it rejects is replaced
+// by a variant with an inserted apostrophe and/or an appended period, if it accepts one.
+std::vector<std::string> tokenizeText(const std::u32string& text, std::function<bool(const std::string&)> dictionaryContains);
diff --git a/tests/tokenizationTests.cpp b/tests/tokenizationTests.cpp
index 14973ad..dcd3941 100644
--- a/tests/tokenizationTests.cpp
+++ b/tests/tokenizationTests.cpp
@@ -1,6 +1,7 @@
 ﻿#include <gmock/gmock.h>
 #include "tokenization.h"
 #include <regex>
+#include <unordered_set>
 
 using namespace testing;
 using std::string;
@@ -8,48 +9,51 @@ using std::u32string;
 using std::vector;
 using std::regex;
 
+bool returnTrue(const string&) { // Stand-in dictionary predicate for the tests: accepts every word
+	return true;
+}
+
 TEST(tokenizeText, simpleCases) {
-	EXPECT_THAT(tokenizeText(U""), IsEmpty());
-	EXPECT_THAT(tokenizeText(U"  \t\n\r\n "), IsEmpty());
+	EXPECT_THAT(tokenizeText(U"", returnTrue), IsEmpty());
+	EXPECT_THAT(tokenizeText(U"  \t\n\r\n ", returnTrue), IsEmpty());
 	EXPECT_THAT(
-		tokenizeText(U"Wit is educated insolence."),
+		tokenizeText(U"Wit is educated insolence.", returnTrue),
 		ElementsAre("wit", "is", "educated", "insolence")
 	);
 }
 
 TEST(tokenizeText, numbers) {
 	EXPECT_THAT(
-		tokenizeText(U"Henry V died at 36."),
+		tokenizeText(U"Henry V died at 36.", returnTrue),
 		ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six")
 	);
 	EXPECT_THAT(
-		tokenizeText(U"I spent $4.50 on gum."),
+		tokenizeText(U"I spent $4.50 on gum.", returnTrue),
 		ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum")
 	);
 	EXPECT_THAT(
-		tokenizeText(U"I was born in 1982."),
+		tokenizeText(U"I was born in 1982.", returnTrue),
 		ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two")
 	);
 }
 
 TEST(tokenizeText, abbreviations) {
 	EXPECT_THAT(
-		tokenizeText(U"I live on Dr. Dolittle Dr."),
-		ElementsAre("i", "live", "on", "doctor", "dolittle", "drive")
+		tokenizeText(U"Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }),
+		ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive") // Only "prof." is in the dictionary, so it is expected with its period
 	);
 }
 
 TEST(tokenizeText, apostrophes) {
-	// HACK: "wouldn't" really should not become "wouldnt"!
 	EXPECT_THAT(
-		tokenizeText(U"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk."),
-		ElementsAreArray(vector<string>{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldnt", "walk" })
+		tokenizeText(U"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }),
+		ElementsAreArray(vector<string>{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" }) // Only "wouldn't" is in the dictionary, so its apostrophe is expected back
 	);
 }
 
 TEST(tokenizeText, math) {
 	EXPECT_THAT(
-		tokenizeText(U"'1+2*3=7"),
+		tokenizeText(U"'1+2*3=7", returnTrue),
 		ElementsAre("one", "plus", "two", "times", "three", "equals", "seven")
 	);
 }
@@ -64,7 +68,7 @@ TEST(tokenizeText, wordsUseLimitedCharacters) {
 	}
 
 	regex legal("^[a-z']+$");
-	auto words = tokenizeText(input);
+	auto words = tokenizeText(input, returnTrue);
 	for (const string& word : words) {
 		EXPECT_TRUE(std::regex_match(word, legal)) << word;
 	}