From c9b17e19379acecd56ece7f8f9fd862e0fd6a42c Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Sat, 25 Jun 2016 21:52:04 +0200 Subject: [PATCH] Improved tokenization by taking dictionary into account --- src/phoneExtraction.cpp | 8 ++++++-- src/tokenization.cpp | 38 ++++++++++++++++++++++++++++++++++--- src/tokenization.h | 4 ++-- tests/tokenizationTests.cpp | 30 ++++++++++++++++------------- 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/src/phoneExtraction.cpp b/src/phoneExtraction.cpp index 3252792..3c98875 100644 --- a/src/phoneExtraction.cpp +++ b/src/phoneExtraction.cpp @@ -231,10 +231,14 @@ optional> getPhoneAlignment( return result; } +bool dictionaryContains(dict_t& dictionary, const string& word) { + return dict_wordid(&dictionary, word.c_str()) != BAD_S3WID; +} + void addMissingDictionaryWords(const vector& words, ps_decoder_t& decoder) { map missingPronunciations; for (const string& word : words) { - if (dict_wordid(decoder.dict, word.c_str()) == BAD_S3WID) { + if (!dictionaryContains(*decoder.dict, word)) { string pronunciation; for (Phone phone : wordToPhones(word)) { if (pronunciation.length() > 0) pronunciation += " "; @@ -287,7 +291,7 @@ BoundedTimeline detectPhones( lambda_unique_ptr languageModel; if (dialog) { // Create dialog-specific language model - vector words = tokenizeText(*dialog); + vector words = tokenizeText(*dialog, [&](const string& word) { return dictionaryContains(*decoder->dict, word); }); words.insert(words.begin(), ""); words.push_back(""); languageModel = createLanguageModel(words, *decoder->lmath); diff --git a/src/tokenization.cpp b/src/tokenization.cpp index bf57ebe..239ec45 100644 --- a/src/tokenization.cpp +++ b/src/tokenization.cpp @@ -15,6 +15,8 @@ using std::string; using std::vector; using std::regex; using std::pair; +using boost::optional; +using std::function; lambda_unique_ptr createDummyVoice() { lambda_unique_ptr voice(new_voice(), [](cst_voice* voice) { delete_voice(voice); }); @@ -51,7 +53,27 @@ vector tokenizeViaFlite(const string& text) { return result; } -vector tokenizeText(const u32string& text) { +optional findSimilarDictionaryWord(const string& word, function dictionaryContains) { + for (bool addPeriod : { false, true }) { + for (int apostropheIndex = -1; apostropheIndex <= static_cast(word.size()); ++apostropheIndex) { + string modified = word; + if (apostropheIndex != -1) { + modified.insert(apostropheIndex, "'"); + } + if (addPeriod) { + modified += "."; + } + + if (dictionaryContains(modified)) { + return modified; + } + } + } + + return boost::none; +} + +vector tokenizeText(const u32string& text, function dictionaryContains) { vector words = tokenizeViaFlite(toASCII(text)); // Join words separated by apostophes @@ -63,7 +85,7 @@ vector tokenizeText(const u32string& text) { } // Turn some symbols into words, remove the rest - vector> replacements { + const static vector> replacements { { regex("&"), "and" }, { regex("\\*"), "times" }, { regex("\\+"), "plus" }, @@ -73,12 +95,22 @@ vector tokenizeText(const u32string& text) { }; for (size_t i = 0; i < words.size(); ++i) { for (const auto& replacement : replacements) { - words[i] = std::regex_replace(words[i], replacement.first, replacement.second); + words[i] = regex_replace(words[i], replacement.first, replacement.second); } } // Remove empty words words.erase(std::remove_if(words.begin(), words.end(), [](const string& s) { return s.empty(); }), words.end()); + // Try to replace words that are not in the dictionary with similar ones that are + for (size_t i = 0; i < words.size(); ++i) { + if (!dictionaryContains(words[i])) { + optional modifiedWord = findSimilarDictionaryWord(words[i], dictionaryContains); + if (modifiedWord) { + words[i] = *modifiedWord; + } + } + } + return words; } diff --git a/src/tokenization.h b/src/tokenization.h index 36fe276..7b34961 100644 --- a/src/tokenization.h +++ b/src/tokenization.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include -std::vector tokenizeText(const std::u32string& text); \ No newline at end of file +std::vector tokenizeText(const std::u32string& text, std::function dictionaryContains); diff --git a/tests/tokenizationTests.cpp b/tests/tokenizationTests.cpp index 14973ad..dcd3941 100644 --- a/tests/tokenizationTests.cpp +++ b/tests/tokenizationTests.cpp @@ -1,6 +1,7 @@ #include #include "tokenization.h" #include +#include using namespace testing; using std::string; @@ -8,48 +9,51 @@ using std::u32string; using std::vector; using std::regex; +bool returnTrue(const string&) { + return true; +} + TEST(tokenizeText, simpleCases) { - EXPECT_THAT(tokenizeText(U""), IsEmpty()); - EXPECT_THAT(tokenizeText(U" \t\n\r\n "), IsEmpty()); + EXPECT_THAT(tokenizeText(U"", returnTrue), IsEmpty()); + EXPECT_THAT(tokenizeText(U" \t\n\r\n ", returnTrue), IsEmpty()); EXPECT_THAT( - tokenizeText(U"Wit is educated insolence."), + tokenizeText(U"Wit is educated insolence.", returnTrue), ElementsAre("wit", "is", "educated", "insolence") ); } TEST(tokenizeText, numbers) { EXPECT_THAT( - tokenizeText(U"Henry V died at 36."), + tokenizeText(U"Henry V died at 36.", returnTrue), ElementsAre("henry", "the", "fifth", "died", "at", "thirty", "six") ); EXPECT_THAT( - tokenizeText(U"I spent $4.50 on gum."), + tokenizeText(U"I spent $4.50 on gum.", returnTrue), ElementsAre("i", "spent", "four", "dollars", "fifty", "cents", "on", "gum") ); EXPECT_THAT( - tokenizeText(U"I was born in 1982."), + tokenizeText(U"I was born in 1982.", returnTrue), ElementsAre("i", "was", "born", "in", "nineteen", "eighty", "two") ); } TEST(tokenizeText, abbreviations) { EXPECT_THAT( - tokenizeText(U"I live on Dr. Dolittle Dr."), - ElementsAre("i", "live", "on", "doctor", "dolittle", "drive") + tokenizeText(U"Prof. Foo lives on Dr. Dolittle Dr.", [](const string& word) { return word == "prof."; }), + ElementsAre("prof.", "foo", "lives", "on", "doctor", "dolittle", "drive") ); } TEST(tokenizeText, apostrophes) { - // HACK: "wouldn't" really should not become "wouldnt"! EXPECT_THAT( - tokenizeText(U"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk."), - ElementsAreArray(vector{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldnt", "walk" }) + tokenizeText(U"'Tis said he'd wish'd for a 'bus 'cause he wouldn't walk.", [](const string& word) { return word == "wouldn't"; }), + ElementsAreArray(vector{ "tis", "said", "he'd", "wish'd", "for", "a", "bus", "cause", "he", "wouldn't", "walk" }) ); } TEST(tokenizeText, math) { EXPECT_THAT( - tokenizeText(U"'1+2*3=7"), + tokenizeText(U"'1+2*3=7", returnTrue), ElementsAre("one", "plus", "two", "times", "three", "equals", "seven") ); } @@ -64,7 +68,7 @@ TEST(tokenizeText, wordsUseLimitedCharacters) { } regex legal("^[a-z']+$"); - auto words = tokenizeText(input); + auto words = tokenizeText(input, returnTrue); for (const string& word : words) { EXPECT_TRUE(std::regex_match(word, legal)) << word; }