From f2f6f75932c680bbd543c6a620c97506a46b0d4e Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Wed, 18 Nov 2015 20:59:03 +0100 Subject: [PATCH] Refactoring - Moved phone recognition code to phone_extraction.cpp - Introduced type centiseconds - Code reorganization --- .idea/LipSync.iml | 18 ++- CMakeLists.txt | 15 ++- src/Phone.cpp | 36 ++++++ src/Phone.h | 78 ++++++++++++ src/audio_input/16kHzMonoStream.cpp | 26 ---- src/audio_input/16kHzMonoStream.h | 10 -- src/audio_input/WaveFileReader.cpp | 2 +- src/audio_input/{IOTools.h => io_tools.h} | 0 ...veFileWriter.cpp => wave_file_writing.cpp} | 4 +- .../{WaveFileWriter.h => wave_file_writing.h} | 0 src/centiseconds.cpp | 9 ++ src/centiseconds.h | 8 ++ src/main.cpp | 97 ++------------- src/phone_extraction.cpp | 116 ++++++++++++++++++ src/phone_extraction.h | 14 +++ src/platform_tools.h | 8 ++ src/platform_tools_win.cpp | 24 ++++ 17 files changed, 326 insertions(+), 139 deletions(-) create mode 100644 src/Phone.cpp create mode 100644 src/Phone.h delete mode 100644 src/audio_input/16kHzMonoStream.cpp delete mode 100644 src/audio_input/16kHzMonoStream.h rename src/audio_input/{IOTools.h => io_tools.h} (100%) rename src/audio_input/{WaveFileWriter.cpp => wave_file_writing.cpp} (96%) rename src/audio_input/{WaveFileWriter.h => wave_file_writing.h} (100%) create mode 100644 src/centiseconds.cpp create mode 100644 src/centiseconds.h create mode 100644 src/phone_extraction.cpp create mode 100644 src/phone_extraction.h create mode 100644 src/platform_tools.h create mode 100644 src/platform_tools_win.cpp diff --git a/.idea/LipSync.iml b/.idea/LipSync.iml index ada0c35..e9eb483 100644 --- a/.idea/LipSync.iml +++ b/.idea/LipSync.iml @@ -126,19 +126,25 @@ - - - + + + - - + + + + + + + + @@ -169,8 +175,8 @@ - + diff --git a/CMakeLists.txt b/CMakeLists.txt index a5381d1..d0ac8f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.3) project(LipSync) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall") set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") @@ -11,10 +11,15 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") set(Boost_USE_STATIC_LIBS ON) # Use static libs set(Boost_USE_MULTITHREADED ON) # Enable multithreading support set(Boost_USE_STATIC_RUNTIME ON) # Use static C++ runtime -find_package(Boost REQUIRED COMPONENTS filesystem locale ) +find_package(Boost REQUIRED COMPONENTS filesystem locale system) include_directories(${Boost_INCLUDE_DIRS}) -set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/16kHzMonoStream.cpp src/audio_input/16kHzMonoStream.h src/audio_input/WaveFileWriter.cpp src/audio_input/WaveFileWriter.h src/audio_input/IOTools.h) +set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/wave_file_writing.cpp src/audio_input/wave_file_writing.h src/audio_input/io_tools.h src/platform_tools.h src/phone_extraction.cpp src/phone_extraction.h src/Phone.cpp src/Phone.h src/centiseconds.cpp src/centiseconds.h) +if(WIN32) + set(SOURCE_FILES "${SOURCE_FILES};src/platform_tools_win.cpp") +else() + message(FATAL_ERROR "Target platform not supported.") +endif() include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat") FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c") @@ -44,5 +49,5 @@ endfunction() # Copy resource files set(modelDir "${CMAKE_SOURCE_DIR}/lib/pocketsphinx-5prealpha-2015-08-05/model") -copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx/acoustic_model") -copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/language_model") +copy_after_build("${modelDir}/en-us/en-us-phone.lm.bin" "res/sphinx") +copy_after_build("${modelDir}/en-us/en-us/*" "res/sphinx/acoustic_model") diff --git a/src/Phone.cpp b/src/Phone.cpp new file mode 100644 index 0000000..f601e39 --- /dev/null +++ b/src/Phone.cpp @@ -0,0 +1,36 @@ +#include +#include "Phone.h" + +using std::string; + +template +boost::bimap +makeBimap(std::initializer_list::value_type> list) { + return boost::bimap(list.begin(), list.end()); +} + +boost::bimap phonesByName = makeBimap({ + { "None", Phone::None }, + { "Unknown", Phone::Unknown }, + { "AO", Phone::AO }, { "AA", Phone::AA }, { "IY", Phone::IY }, { "UW", Phone::UW }, + { "EH", Phone::EH }, { "IH", Phone::IH }, { "UH", Phone::UH }, { "AH", Phone::AH }, + { "AE", Phone::AE }, { "EY", Phone::EY }, { "AY", Phone::AY }, { "OW", Phone::OW }, + { "AW", Phone::AW }, { "OY", Phone::OY }, { "ER", Phone::ER }, { "P", Phone::P }, + { "B", Phone::B }, { "T", Phone::T }, { "D", Phone::D }, { "K", Phone::K }, + { "G", Phone::G }, { "CH", Phone::CH }, { "JH", Phone::JH }, { "F", Phone::F }, + { "V", Phone::V }, { "TH", Phone::TH }, { "DH", Phone::DH }, { "S", Phone::S }, + { "Z", Phone::Z }, { "SH", Phone::SH }, { "ZH", Phone::ZH }, { "HH", Phone::HH }, + { "M", Phone::M }, { "N", Phone::N }, { "NG", Phone::NG }, { "L", Phone::L }, + { "R", Phone::R }, { "Y", Phone::Y }, { "W", Phone::W }, +}); + +Phone stringToPhone(const string& s) { + auto it = phonesByName.left.find(s); + return (it != phonesByName.left.end()) ? it->second : Phone::Unknown; +} + +string phoneToString(Phone phone) { + auto it = phonesByName.right.find(phone); + return (it != phonesByName.right.end()) ? it->second : phoneToString(Phone::Unknown); +} + diff --git a/src/Phone.h b/src/Phone.h new file mode 100644 index 0000000..4928513 --- /dev/null +++ b/src/Phone.h @@ -0,0 +1,78 @@ +#ifndef LIPSYNC_PHONE_H +#define LIPSYNC_PHONE_H + +// Defines a subset of the Arpabet +enum class Phone { + None, + Unknown, + + ///////// + // Vowels + + // ... monophthongs + AO, // [ɔ] as in [o]ff, f[a]ll, fr[o]st + AA, // [ɑ] as in f[a]ther + IY, // [i] as in b[ee], sh[e] + UW, // [u] as in y[ou], n[ew], f[oo]d + EH, // [ɛ] as in r[e]d, m[e]n + IH, // [ɪ] as in b[i]g, w[i]n + UH, // [ʊ] as in sh[ou]ld, c[ou]ld + AH, // [ʌ, ə] as in b[u]t, s[u]n, [a]lone, disc[u]s + AE, // [æ] as in [a]t, b[a]t + + // ... diphthongs + EY, // [eɪ] as in s[ay], [ei]ght + AY, // [aɪ] as in m[y], wh[y], r[i]de + OW, // [oʊ] as in sh[ow], c[oa]t + AW, // [aʊ] as in h[ow], n[ow] + OY, // [ɔɪ] as in b[oy], t[oy] + + // ... r-colored + ER, // [ɝ] as in h[er], b[ir]d, h[ur]t + + ///////////// + // Consonants + + // ... stops + P, // [p] as in [p]ay + B, // [b] as in [b]uy + T, // [t] as in [t]ake + D, // [d] as in [d]ay + K, // [k] as in [k]ey + G, // [g] as in [g]o + + // ... affricates + CH, // [tʃ] as in [ch]air + JH, // [dʒ] as in [j]ust + + // ... fricatives + F, // [f] as in [f]or + V, // [v] as in [v]ery + TH, // [θ] as in [th]anks + DH, // [ð] as in [th]at + S, // [s] as in [s]ay + Z, // [z] as in [z]oo + SH, // [ʃ] as in [sh]ow + ZH, // [ʒ] as in mea[s]ure, plea[s]ure + HH, // [h] as in [h]ouse + + // ... nasals + M, // [m] as in [m]an + N, // [n] as in [no] + NG, // [ŋ] as in si[ng] + + // ... liquids + L, // [ɫ] as in [l]ate + R, // [r, ɹ] as in [r]un + + // ... semivowels + Y, // [j] as in [y]es + W // [w] as in [w]ay +}; + +Phone stringToPhone(const std::string& s); + +std::string phoneToString(Phone phone); + + +#endif //LIPSYNC_PHONE_H diff --git a/src/audio_input/16kHzMonoStream.cpp b/src/audio_input/16kHzMonoStream.cpp deleted file mode 100644 index 72fc917..0000000 --- a/src/audio_input/16kHzMonoStream.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include "16kHzMonoStream.h" -#include "WaveFileReader.h" -#include "ChannelDownmixer.h" -#include "SampleRateConverter.h" - -using std::runtime_error; - -std::unique_ptr create16kHzMonoStream(std::string fileName) { - // Create audio stream - std::unique_ptr stream(new WaveFileReader(fileName)); - - // Downmix, if required - if (stream->getChannelCount() != 1) { - stream.reset(new ChannelDownmixer(std::move(stream))); - } - - // Downsample, if required - if (stream->getFrameRate() < 16000) { - throw runtime_error("Sample rate must not be below 16kHz."); - } - if (stream->getFrameRate() != 16000) { - stream.reset(new SampleRateConverter(std::move(stream), 16000)); - } - - return stream; -} diff --git a/src/audio_input/16kHzMonoStream.h b/src/audio_input/16kHzMonoStream.h deleted file mode 100644 index 19bcd88..0000000 --- a/src/audio_input/16kHzMonoStream.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef LIPSYNC_WAVEFILEREADER16KHZMONO_H -#define LIPSYNC_WAVEFILEREADER16KHZMONO_H - -#include "AudioStream.h" -#include -#include - -std::unique_ptr create16kHzMonoStream(std::string fileName); - -#endif //LIPSYNC_WAVEFILEREADER16KHZMONO_H diff --git a/src/audio_input/WaveFileReader.cpp b/src/audio_input/WaveFileReader.cpp index ef0a4b4..bcb6322 100644 --- a/src/audio_input/WaveFileReader.cpp +++ b/src/audio_input/WaveFileReader.cpp @@ -1,6 +1,6 @@ #include #include "WaveFileReader.h" -#include "IOTools.h" +#include "io_tools.h" using std::runtime_error; using fmt::format; diff --git a/src/audio_input/IOTools.h b/src/audio_input/io_tools.h similarity index 100% rename from src/audio_input/IOTools.h rename to src/audio_input/io_tools.h diff --git a/src/audio_input/WaveFileWriter.cpp b/src/audio_input/wave_file_writing.cpp similarity index 96% rename from src/audio_input/WaveFileWriter.cpp rename to src/audio_input/wave_file_writing.cpp index 6216e2a..0dc8f20 100644 --- a/src/audio_input/WaveFileWriter.cpp +++ b/src/audio_input/wave_file_writing.cpp @@ -1,6 +1,6 @@ #include -#include "WaveFileWriter.h" -#include "IOTools.h" +#include "wave_file_writing.h" +#include "io_tools.h" using namespace little_endian; diff --git a/src/audio_input/WaveFileWriter.h b/src/audio_input/wave_file_writing.h similarity index 100% rename from src/audio_input/WaveFileWriter.h rename to src/audio_input/wave_file_writing.h diff --git a/src/centiseconds.cpp b/src/centiseconds.cpp new file mode 100644 index 0000000..ce14bd7 --- /dev/null +++ b/src/centiseconds.cpp @@ -0,0 +1,9 @@ +#include +#include +#include +#include "Centiseconds.h" + +std::ostream& operator <<(std::ostream& stream, const centiseconds cs) { + return stream << cs.count() << "cs"; +} + diff --git a/src/centiseconds.h b/src/centiseconds.h new file mode 100644 index 0000000..3d17f0f --- /dev/null +++ b/src/centiseconds.h @@ -0,0 +1,8 @@ +#ifndef LIPSYNC_CENTISECONDS_H +#define LIPSYNC_CENTISECONDS_H + +typedef std::chrono::duration centiseconds; + +std::ostream& operator <<(std::ostream& stream, const centiseconds cs); + +#endif //LIPSYNC_CENTISECONDS_H diff --git a/src/main.cpp b/src/main.cpp index f7ae262..0ffaa95 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,97 +1,16 @@ -#include -#include -#include -#include -#include #include -#include -#include "audio_input/16kHzMonoStream.h" - -using std::runtime_error; -using std::shared_ptr; -using std::unique_ptr; - -#define MODELDIR "X:/dev/projects/LipSync/lib/pocketsphinx-5prealpha-2015-08-05/model" - -// Converts a float in the range -1..1 to a signed 16-bit int -int16_t floatSampleToInt16(float sample) { - sample = std::max(sample, -1.0f); - sample = std::min(sample, 1.0f); - return static_cast(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN); -} +#include "audio_input/WaveFileReader.h" +#include "phone_extraction.h" int main(int argc, char *argv[]) { - shared_ptr config( - cmd_ln_init( - nullptr, ps_args(), true, - // Set acoustic model - "-hmm", MODELDIR "/en-us/en-us", - // Set phonetic language model - "-allphone", MODELDIR "/en-us/en-us-phone.lm.bin", - "-allphone_ci", "yes", - // The following settings are Voodoo to me. - // I copied them from http://cmusphinx.sourceforge.net/wiki/phonemerecognition - // Set beam width applied to every frame in Viterbi search - "-beam", "1e-20", - // Set beam width applied to phone transitions - "-pbeam", "1e-20", - // Set language model probability weight - "-lw", "2.0", - nullptr), - [](cmd_ln_t* config) { cmd_ln_free_r(config); }); - if (!config) throw runtime_error("Error creating configuration."); + // Create audio stream + std::unique_ptr audioStream( + new WaveFileReader(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)")); - shared_ptr recognizer( - ps_init(config.get()), - [](ps_decoder_t* recognizer) { ps_free(recognizer); }); - if (!recognizer) throw runtime_error("Error creating speech recognizer."); + std::map phones = detectPhones(std::move(audioStream)); - unique_ptr audioStream = - create16kHzMonoStream(R"(C:\Users\Daniel\Desktop\audio-test\test 16000Hz 1ch 16bit.wav)"); - - int error = ps_start_utt(recognizer.get()); - if (error) throw runtime_error("Error starting utterance processing."); - - auto start = std::chrono::steady_clock::now(); - - std::vector buffer; - const int capacity = 1600; // 0.1 second capacity - buffer.reserve(capacity); - int sampleCount = 0; - do { - // Read to buffer - buffer.clear(); - while (buffer.size() < capacity) { - float sample; - if (!audioStream->getNextSample(sample)) break; - buffer.push_back(floatSampleToInt16(sample)); - } - - // Analyze buffer - int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false); - if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data."); - - sampleCount += buffer.size(); - - std::cout << sampleCount / 16000.0 << "s\n"; - } while (buffer.size()); - error = ps_end_utt(recognizer.get()); - if (error) throw runtime_error("Error ending utterance processing."); - - auto end = std::chrono::steady_clock::now(); - std::cout << std::chrono::duration_cast>(end - start).count() << "\n"; - - ps_seg_t *segmentationIter; - int32 score; - for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) { - // Get phoneme - char const *phoneme = ps_seg_word(segmentationIter); - - // Get timing - int startFrame, endFrame; - ps_seg_frames(segmentationIter, &startFrame, &endFrame); - - printf(">>> %-5s %-5d %-5d\n", phoneme, startFrame, endFrame); + for (auto& pair : phones) { + std::cout << pair.first << ": " << phoneToString(pair.second) << "\n"; } return 0; diff --git a/src/phone_extraction.cpp b/src/phone_extraction.cpp new file mode 100644 index 0000000..272da62 --- /dev/null +++ b/src/phone_extraction.cpp @@ -0,0 +1,116 @@ +#include +#include +#include +#include "phone_extraction.h" +#include "audio_input/SampleRateConverter.h" +#include "audio_input/ChannelDownmixer.h" +#include "platform_tools.h" +using std::runtime_error; +using std::unique_ptr; +using std::shared_ptr; +using std::string; +using std::map; +using boost::filesystem::path; + +unique_ptr to16kHzMono(unique_ptr stream) { + // Downmix, if required + if (stream->getChannelCount() != 1) { + stream.reset(new ChannelDownmixer(std::move(stream))); + } + + // Downsample, if required + if (stream->getFrameRate() < 16000) { + throw runtime_error("Sample rate must not be below 16kHz."); + } + if (stream->getFrameRate() != 16000) { + stream.reset(new SampleRateConverter(std::move(stream), 16000)); + } + + return stream; +} + +// Converts a float in the range -1..1 to a signed 16-bit int +int16_t floatSampleToInt16(float sample) { + sample = std::max(sample, -1.0f); + sample = std::min(sample, 1.0f); + return static_cast(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN); +} + +map detectPhones(unique_ptr audioStream) { + // Convert audio stream to the exact format PocketSphinx requires + audioStream = to16kHzMono(std::move(audioStream)); + + // Create PocketSphinx configuration + path binDirectory(getBinDirectory()); + path resDirectory(binDirectory.parent_path() / "res"); + shared_ptr config( + cmd_ln_init( + nullptr, ps_args(), true, + // Set acoustic model + "-hmm", (resDirectory / "sphinx/acoustic_model").string().c_str(), + // Set phonetic language model + "-allphone", (resDirectory / "sphinx/en-us-phone.lm.bin").string().c_str(), + "-allphone_ci", "yes", + // The following settings are taken from http://cmusphinx.sourceforge.net/wiki/phonemerecognition + // Set beam width applied to every frame in Viterbi search + "-beam", "1e-20", + // Set beam width applied to phone transitions + "-pbeam", "1e-20", + // Set language model probability weight + "-lw", "2.0", + nullptr), + [](cmd_ln_t* config) { cmd_ln_free_r(config); }); + if (!config) throw runtime_error("Error creating configuration."); + + // Create phone recognizer + shared_ptr recognizer( + ps_init(config.get()), + [](ps_decoder_t* recognizer) { ps_free(recognizer); }); + if (!recognizer) throw runtime_error("Error creating speech recognizer."); + + // Start recognition + int error = ps_start_utt(recognizer.get()); + if (error) throw runtime_error("Error starting utterance processing."); + + // Process entire sound file + std::vector buffer; + const int capacity = 1600; // 0.1 second capacity + buffer.reserve(capacity); + int sampleCount = 0; + do { + // Read to buffer + buffer.clear(); + while (buffer.size() < capacity) { + float sample; + if (!audioStream->getNextSample(sample)) break; + buffer.push_back(floatSampleToInt16(sample)); + } + + // Analyze buffer + int searchedFrameCount = ps_process_raw(recognizer.get(), buffer.data(), buffer.size(), false, false); + if (searchedFrameCount < 0) throw runtime_error("Error decoding raw audio data."); + + sampleCount += buffer.size(); + } while (buffer.size()); + error = ps_end_utt(recognizer.get()); + if (error) throw runtime_error("Error ending utterance processing."); + + // Collect results into map + map result; + ps_seg_t *segmentationIter; + int32 score; + int endFrame; + for (segmentationIter = ps_seg_iter(recognizer.get(), &score); segmentationIter; segmentationIter = ps_seg_next(segmentationIter)) { + // Get phone + char const *phone = ps_seg_word(segmentationIter); + + // Get timing + int startFrame; + ps_seg_frames(segmentationIter, &startFrame, &endFrame); + + result[centiseconds(startFrame)] = stringToPhone(phone); + } + // Add dummy entry past the last phone + result[centiseconds(endFrame + 1)] = Phone::None; + return result; +} diff --git a/src/phone_extraction.h b/src/phone_extraction.h new file mode 100644 index 0000000..844f30a --- /dev/null +++ b/src/phone_extraction.h @@ -0,0 +1,14 @@ +#ifndef LIPSYNC_PHONE_EXTRACTION_H +#define LIPSYNC_PHONE_EXTRACTION_H + +#include +#include +#include +#include +#include "audio_input/AudioStream.h" +#include "Phone.h" +#include "centiseconds.h" + +std::map detectPhones(std::unique_ptr audioStream); + +#endif //LIPSYNC_PHONE_EXTRACTION_H diff --git a/src/platform_tools.h b/src/platform_tools.h new file mode 100644 index 0000000..55d27ea --- /dev/null +++ b/src/platform_tools.h @@ -0,0 +1,8 @@ +#ifndef LIPSYNC_PLATFORM_TOOLS_H +#define LIPSYNC_PLATFORM_TOOLS_H + +#include + +boost::filesystem::path getBinDirectory(); + +#endif //LIPSYNC_PLATFORM_TOOLS_H diff --git a/src/platform_tools_win.cpp b/src/platform_tools_win.cpp new file mode 100644 index 0000000..d500b2a --- /dev/null +++ b/src/platform_tools_win.cpp @@ -0,0 +1,24 @@ +#include "platform_tools.h" + +#include + +boost::filesystem::path getBinDirectory() { + std::vector executablePath(MAX_PATH); + + // Try to get the executable path with a buffer of MAX_PATH characters. + DWORD result = GetModuleFileNameW(0, executablePath.data(), executablePath.size()); + + // As long the function returns the buffer size, it is indicating that the buffer + // was too small. Keep doubling the buffer size until it fits. + while(result == executablePath.size()) { + executablePath.resize(executablePath.size() * 2); + result = GetModuleFileNameW(0, executablePath.data(), executablePath.size()); + } + + // If the function returned 0, something went wrong + if (result == 0) { + throw std::runtime_error("Could not determine path of bin directory."); + } + + return boost::filesystem::path(executablePath.data()).parent_path(); +}