31 #include <lm/model.hh>
/// Single-element overload of insert: takes one element `t` by const
/// reference and returns a `trie<T>*`.
// NOTE(review): the body is not part of this listing. The range overload
// continues its descent from the pointer returned here, so this presumably
// returns the child node stored under `t`, creating it if absent -- confirm.
50 trie<T>* insert(T
const& t)
/// Range overload of insert: takes two values of type `I` named begin/end.
/// `*begin` is dereferenced below, so `I` is used like an iterator.
// NOTE(review): only one statement of the body is visible in this listing;
// the surrounding loop and the return statement are not shown.
56 trie<T>* insert(I begin, I end)
// Descend one level: the single-element insert yields the node to continue from.
62 p = p->insert(*begin);
/// Looks up the direct child stored under key `t`.
/// @param t  key searched for in children_.
/// @return   pointer to the child trie, or nullptr when children_ has no
///           entry for `t`.
69 trie<T>* find(T
const& t)
71 auto i = children_.find(t);
// Map miss -> nullptr; otherwise the address of the mapped child node.
73 return i == children_.end() ?
nullptr : &i->second;
/// Reports how many direct children are stored under key `t`.
/// children_ is a std::map (unique keys), so the result is 0 or 1.
/// @param t  key to test.
/// @return   children_.count(t).
76 size_t count(T
const& t)
const
78 return children_.count(t);
// Direct children of this node, keyed by element; each mapped value is itself
// a trie<T> stored by value inside the map.
82 std::map<T, trie<T>> children_;
/// Builds a respacer from a dictionary file and a language-model file.
/// @param dictionary_path      path opened as a std::ifstream and read line
///                             by line; each line is inserted into dictionary_.
/// @param language_model_path  path handed (as a C string) to the model_
///                             constructor.
98 respacer(std::string
const& dictionary_path, std::string
const& language_model_path) : model_(language_model_path.c_str())
// NOTE(review): no check that the stream opened is visible in this listing;
// if opening fails, std::getline fails immediately and no word is loaded --
// confirm that a silent empty dictionary is acceptable.
107 std::ifstream dictionary_stream{dictionary_path};
110 while(std::getline(dictionary_stream, word))
// Lower-case the line in place.
// NOTE(review): ::tolower is applied to plain char values; for negative chars
// (non-ASCII bytes where char is signed) the behaviour is undefined --
// consider converting through unsigned char.
112 std::transform(word.begin(), word.end(), word.begin(), ::tolower);
// Insert the letters, then add a '\0' child under the node reached by the
// last letter. The prefix scan in respace() tests for that child with
// count('\0') to recognise a complete word.
113 dictionary_.insert(word.begin(), word.end())->insert(
'\0');
/// Public overload of respace: takes a string of letters and returns a vector
/// of strings.
// NOTE(review): only the first statement of the body is visible here.
// Presumably it then calls the recursive overload with `best` and returns
// best.second -- confirm against the full file.
123 std::vector<std::string>
respace(std::string
const& letters)
// Best candidate so far as (score, words). The score starts at the lowest
// finite double and the word list starts empty; the recursive overload
// replaces it whenever a score compares greater than best.first.
125 std::pair<double, std::vector<std::string>> best(std::numeric_limits<double>::lowest(), {});
/// Recursive search step: finds every dictionary word that is a prefix of
/// `remaining_letters`, scores it with model_, and recurses on the rest.
/// @param remaining_letters  letters not yet assigned to a word.
/// @param best               in/out: best (score, words) found so far; it is
///                           overwritten when a complete candidate scores
///                           higher than best.first.
/// @param data               one (cumulative score, word, model state after
///                           the word) tuple per word chosen so far; defaults
///                           to empty.
// NOTE(review): several lines of this function (braces and at least the
// declaration of `d`) are missing from this listing.
142 void respace(std::string
const& remaining_letters, std::pair<
double, std::vector<std::string>>& best, std::vector<std::tuple<double, std::string, lm::ngram::State>>
const& data = {})
144 if(remaining_letters.empty())
// All letters consumed: add the model score of "</s>", taken from the state
// after the last word, to the cumulative score of the last word.
// NOTE(review): data.back() is called with no visible emptiness check. If this
// point is reached with empty `data` (e.g. an empty input string), that is
// undefined behaviour -- confirm a guard exists in the lines not shown.
149 lm::ngram::State out_state;
150 double score = std::get<0>(data.back()) + model_.Score(std::get<2>(data.back()), model_.GetVocabulary().Index(
"</s>"), out_state);
152 if(score > best.first)
// New best: collect the word field (tuple element 1) of every entry, in order.
154 std::vector<std::string> words;
155 std::transform(data.begin(), data.end(), back_inserter(words), [](
auto const& t){
return std::get<1>(t); });
157 best = {score, words};
// Lengths of every prefix of remaining_letters that is a dictionary word.
169 std::vector<std::string::size_type> head_lengths;
// Walk the trie one letter at a time. find() returns nullptr when no child
// exists for the letter, which makes `p` null and ends the loop.
172 detail::trie<char> *p = &dictionary_;
174 for(std::string::size_type i = 0; p && i != remaining_letters.size(); ++i)
176 p = p->find(remaining_letters[i]);
// A '\0' child marks a complete word (the constructor inserts it after each
// dictionary word), so the first i + 1 letters form a word.
178 if(p && p->count(
'\0'))
180 head_lengths.push_back(i + 1);
// Lengths were collected in increasing order; reverse so the longest prefix
// is tried first.
184 std::reverse(head_lengths.begin(), head_lengths.end());
190 for(
auto const word_length : head_lengths)
// Model state to score from: begin-of-sentence for the first word, otherwise
// the state stored after the previous word.
192 lm::ngram::State
const& in_state = data.size() ? std::get<2>(data.back()) : model_.BeginSentenceState();
193 lm::ngram::State out_state;
195 std::string
const word = remaining_letters.substr(0, word_length);
// Cumulative score: previous cumulative score (0 for the first word) plus
// this word's score given in_state.
197 double score = data.size() ? std::get<0>(data.back()) : 0.;
198 score += model_.Score(in_state, model_.GetVocabulary().Index(word), out_state);
// Only this comparison guards the recursion below in the visible lines.
// NOTE(review): as pruning this presumably relies on Score() never being
// positive, so that a cumulative score cannot recover once it is not above
// best.first -- confirm against the language-model API.
200 if(score > best.first)
// NOTE(review): the declaration of `d` is not shown in this listing;
// presumably it is a copy of `data` -- confirm.
203 d.emplace_back(score, word, out_state);
// Recurse on the letters that follow the chosen word.
205 respace(remaining_letters.substr(word_length), best, d);
// Language model used for scoring; constructed from language_model_path.
211 lm::ngram::Model model_;
// Trie of lower-cased dictionary words, filled by the constructor.
212 detail::trie<char> dictionary_;
std::vector< std::string > respace(std::string const &letters)
Definition: respacer.h:123
Definition: respacer.h:93
Definition: respacer.h:42
respacer(std::string const &dictionary_path, std::string const &language_model_path)
Definition: respacer.h:98