respace  1.0
respacer.h
Go to the documentation of this file.
1 /*
2  (C) Copyright Thierry Seegers 2015. Distributed under the following license:
3 
4  Boost Software License - Version 1.0 - August 17th, 2003
5 
6  Permission is hereby granted, free of charge, to any person or organization
7  obtaining a copy of the software and accompanying documentation covered by
8  this license (the "Software") to use, reproduce, display, distribute,
9  execute, and transmit the Software, and to prepare derivative works of the
10  Software, and to permit third-parties to whom the Software is furnished to
11  do so, all subject to the following:
12 
13  The copyright notices in the Software and this entire statement, including
14  the above license grant, this restriction and the following disclaimer,
15  must be included in all copies of the Software, in whole or in part, and
16  all derivative works of the Software, unless such copies or derivative
17  works are solely in the form of machine-executable object code generated by
18  a source language processor.
19 
20  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23  SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24  FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26  DEALINGS IN THE SOFTWARE.
27  */
28 
29 #pragma once
30 
31 #include <lm/model.hh>
32 
33 #include <algorithm>
34 #include <cctype>
35 #include <fstream>
36 #include <limits>
37 #include <map>
38 #include <string>
39 #include <utility>
40 #include <vector>
41 
42 namespace detail
43 {
44  // Minimal trie.
45  // We use it as a special spell-checker that can indicate whether a string is a valid word, an incomplete valid word or neither.
46  template<typename T>
47  class trie
48  {
49  public:
50  trie<T>* insert(T const& t)
51  {
52  return &children_[t];
53  }
54 
55  template<typename I>
56  trie<T>* insert(I begin, I end)
57  {
58  trie<T> *p = this;
59 
60  while(begin != end)
61  {
62  p = p->insert(*begin);
63  ++begin;
64  }
65 
66  return p;
67  }
68 
69  trie<T>* find(T const& t)
70  {
71  auto i = children_.find(t);
72 
73  return i == children_.end() ? nullptr : &i->second;
74  }
75 
76  size_t count(T const& t) const
77  {
78  return children_.count(t);
79  }
80 
81  private:
82  std::map<T, trie<T>> children_;
83  };
84 
85 }
86 
93 class respacer
94 {
95 public:
98  respacer(std::string const& dictionary_path, std::string const& language_model_path) : model_(language_model_path.c_str())
99  {
100  // Populate our dictionary trie.
101  // We'll insert a '\0' to indicate valid words. e.g.:
102  // b->\0
103  // ->e->\0
104  // ->a
105  // ->n->\0
106  // That way, we know that "be" is a valid word and that "bea" leads to a valid word.
107  std::ifstream dictionary_stream{dictionary_path};
108 
109  std::string word;
110  while(std::getline(dictionary_stream, word))
111  {
112  std::transform(word.begin(), word.end(), word.begin(), ::tolower);
113  dictionary_.insert(word.begin(), word.end())->insert('\0');
114  }
115  }
116 
123  std::vector<std::string> respace(std::string const& letters)
124  {
125  std::pair<double, std::vector<std::string>> best(std::numeric_limits<double>::lowest(), {});
126 
127  respace(letters, best);
128 
129  return best.second;
130  }
131 
132 private:
142  void respace(std::string const& remaining_letters, std::pair<double, std::vector<std::string>>& best, std::vector<std::tuple<double, std::string, lm::ngram::State>> const& data = {})
143  {
144  if(remaining_letters.empty())
145  {
146  // No more letters, we have a sequence that has outscored the current best.
147  // Additionally score the end-of-sentence tag and keep the results if it still outscores the current best.
148 
149  lm::ngram::State out_state;
150  double score = std::get<0>(data.back()) + model_.Score(std::get<2>(data.back()), model_.GetVocabulary().Index("</s>"), out_state);
151 
152  if(score > best.first)
153  {
154  std::vector<std::string> words;
155  std::transform(data.begin(), data.end(), back_inserter(words), [](auto const& t){ return std::get<1>(t); });
156 
157  best = {score, words};
158  }
159  }
160  else
161  {
162  // We still have letters left to build a sentence, analyze them.
163 
164  // From the remaining letters string, finds all valid words at its head (e.g. "beefzz" -> {"b", "be", "bee", "beef"}
165  //
166  // Two important efficiency gains:
167  // 1. Using a \ref trie<char> to find all valid words. (I tried a sorted vector and it did not perform as well.)
168  // 2. Returns lengths from longest to shortest because sentences with fewer words tend to outscore sentences with more words.
169  std::vector<std::string::size_type> head_lengths;
170 
171  {
172  detail::trie<char> *p = &dictionary_;
173 
174  for(std::string::size_type i = 0; p && i != remaining_letters.size(); ++i)
175  {
176  p = p->find(remaining_letters[i]);
177 
178  if(p && p->count('\0'))
179  {
180  head_lengths.push_back(i + 1);
181  }
182  }
183 
184  std::reverse(head_lengths.begin(), head_lengths.end());
185  }
186 
187  // For all valid words found at the head of the remaining letters, score the word taking the current model state into account.
188  // If this intermediary score is better than the current best score, analyze the leftover letters.
189  // Otherwise, stop analyzing this path, it can't get better.
190  for(auto const word_length : head_lengths)
191  {
192  lm::ngram::State const& in_state = data.size() ? std::get<2>(data.back()) : model_.BeginSentenceState();
193  lm::ngram::State out_state;
194 
195  std::string const word = remaining_letters.substr(0, word_length);
196 
197  double score = data.size() ? std::get<0>(data.back()) : 0.;
198  score += model_.Score(in_state, model_.GetVocabulary().Index(word), out_state);
199 
200  if(score > best.first)
201  {
202  auto d = data;
203  d.emplace_back(score, word, out_state);
204 
205  respace(remaining_letters.substr(word_length), best, d);
206  }
207  }
208  }
209  }
210 
211  lm::ngram::Model model_;
212  detail::trie<char> dictionary_;
213 };
214 
std::vector< std::string > respace(std::string const &letters)
Definition: respacer.h:123
Definition: respacer.h:93
Definition: respacer.h:42
respacer(std::string const &dictionary_path, std::string const &language_model_path)
Definition: respacer.h:98