1
2
3 import array;
4 import os
5 import sys
6 sys.path.append(os.path.join(os.path.dirname(__file__), "../../lib"));
7 from ternarytree import TernarySearchTree;
8
9
10
11
12
13
14
15
17 '''
18 Word lookup based on a Patricia tree (a.k.a. Radix Tree, a.k.a. Trie data structure).
19 This data structure is efficiently searchable by the prefix of words. Such a prefix search takes
20 a string prefix, and returns all dictionary words that begin with that prefix.
21
22 This class ingests rank/word pair files in a given directory. The ranks are intended
23 to be relative usage frequencies. The class manages these frequency ranks.
24
25 Public methods:
26
27 - add(word)
28 - contains(word)
29 - prefix_search(prefix)
30 - rank(word)
31 '''
32
33 DEFAULT_USER_DICT_FILE_NAME = "dictUserRankAndWord.txt";
34 USER_DICT_FILE_PATH = None;
35
36 - def __init__(self, dictDir=None, userDictFilePath=None):
37 '''
38 Keep track of a Python dict mapping from word to
39 its frequency rank, of the total number of entries, and
40 the number of word files ingested from disk.
41 @param dictDir: full path to directory that contains the dictionary files. If None, a
42 built-in dictionary of 6000 words is used.
43 @type dictDir: string
44 @param userDictFilePath: full path to within a user dictionary. That file must be organized like
45 the other dictionary files.
46 @type userDictFilePath: string
47 '''
48 super(WordCollection, self).__init__();
49 if dictDir is None:
50 self.dictDir = os.path.join(os.path.dirname(__file__), "dict_files");
51 else:
52 self.dictDir = dictDir;
53
54 if userDictFilePath is None:
55 WordCollection.USER_DICT_FILE_PATH = os.path.join(self.dictDir, WordCollection.DEFAULT_USER_DICT_FILE_NAME)
56 else:
57 WordCollection.USER_DICT_FILE_PATH = userDictFilePath;
58
59 self.realWordToFrequencyRanks = {};
60 self.numEntries = 0;
61 self.numDictFilesIngested = 0;
62 self.createDictStructureFromFiles();
63
65 '''
66 Goes through the self.dictDir directory on disk, and reads all the
67 files there. Each file must be a list of whitespace-separated
68 frequency-rank / word pairs. Assumes that self.dictDir is set to directory
69 of dictionary files.
70 @raise ValueError: if a rank in any of the files cannot be read as an integer.
71 '''
72 for rankAndWordListFileName in os.listdir(self.dictDir):
73 with open(os.path.realpath(os.path.join(self.dictDir, rankAndWordListFileName))) as fd:
74 self.numDictFilesIngested += 1;
75
76 rankAndWordLists = fd.read();
77 for line in rankAndWordLists.splitlines():
78 if len(line) == 0:
79 continue;
80
81 try:
82 (rank, word) = line.split(None, 1);
83 except:
84 raise ValueError("Word file file %s contains a line that does not contain a numeric rank, followed by a word: '%s'" %
85 (rankAndWordListFileName, line));
86 try:
87 rankInt = int(rank);
88 except ValueError:
89 raise ValueError("Word file %s contains a line with a non-numeric rank %s" %
90 (rankAndWordListFileName, rank));
91 self.insert(word, rankInt);
92
94 '''
95 Given a word, checks whether the word is already in
96 the in-memory dictionary. If so, does nothing and returns False;
97 Else appends the word to dict_files/dictUserRankAndWord.txt
98 with the provided rank; then returns True
99 @param newWord: word to be added to the user dictionary.
100 @type newWord: string
101 @param rankInt: frequency rank of the word. Rank 0 is most important; 1 is
102 second-most important, etc. OK to have ties.
103 @type rankInt: int
104 '''
105
106 newWord = newWord.encode("UTF-8");
107 if self.contains(newWord):
108 return False;
109 with open(os.path.realpath(WordCollection.USER_DICT_FILE_PATH), 'a') as fd:
110 fd.write(str(rankInt) + "\t" + newWord + "\n");
111
112 self.insert(newWord, rankInt);
113 return True;
114
115
116
117 - def insert(self, word, rankInt=None):
118 '''
119 Insert one word into the word collection.
120 @param word: word to insert.
121 @type word: string
122 @param rankInt: Optionally the frequency rank of the word. If None, no rank is recorded,
123 and subsequent calls to the rank() method will fail.
124 @type rankInt: int
125 @raise ValueError: if word is not valid or empty.
126 '''
127 if not self.contains(word):
128 self.numEntries += 1;
129 self.add(word);
130 if rankInt is not None:
131 self.realWordToFrequencyRanks[word.encode('UTF-8')] = rankInt;
132
133 - def rank(self, word):
134 '''
135 Return the frequency rank of the given word in the collection. I is
136 an error to request the rank of a word that is not in the collection,
137 or of a word whose rank was never specified in an ingestion file or
138 as part of an insert() call.
139 @param word: the word whose frequency rank is requested.
140 @type word: string
141 @raise KeyError: if word or rank are not present in the word collection.
142 '''
143 return self.realWordToFrequencyRanks[word.encode('UTF-8')];
144
146 '''
147 Returns all dictionary entries that begin with the string word.
148 If the optional cutoffRank is specified, it limits the length of the
149 returned list to include only the top cutoffRank words. Example, if
150 cutoffRank=5, only the five most highly ranked dictionary entries
151 are returned. Also, if cutoffRank is specified, the returned list
152 is sorted by decreasing word rank. If cutoffRank is not specified,
153 or is None, the returned list is unsorted.
154 @param word: prefix to search by.
155 @type word: string.
156 @param cutoffRank: Number of most highly ranked dictionary entries to return in rank-sorted order.
157 @type cutoffRank: int
158 '''
159
160 if cutoffRank is not None:
161 if not isinstance(cutoffRank, int):
162 raise TypeError("Parameter cutoffRank for prefix_search must be an integer.");
163
164 matchingWords = super(WordCollection, self).prefix_search(word);
165
166
167
168 finalWords = [];
169 for candidate in matchingWords:
170 if self.startsWith(candidate,word):
171 finalWords.append(candidate);
172 if cutoffRank is not None:
173
174 finalWords.sort(key=self.rank);
175 return finalWords[:cutoffRank]
176 return finalWords;
177
179 '''
180 True if word starts with, or is equal to prefix. Else False.
181 @param word: word to examine.
182 @type word: string.
183 @param prefix: string that word is required to start with for a return of True
184 @type prefix: string
185 '''
186 if len(prefix) > len(word):
187 return False;
188 wordFoundFrag = word[:len(prefix)];
189 return word[:len(prefix)] == prefix;
190
192 '''
193 Return number of words in the collection.
194 '''
195 return self.numEntries;
196
197
198
200 '''
201 Instances behave as the superclass WordCollection. However, all added words
202 are encoded as if entered via a telephone pad. Each letter group of the telephone
203 pad is represented by its first letter. Example: "and" --> "amd" (phone buttons 1,5,2).
204 <p>
205 The class can thus be used to search words by entering for each real word's letters 'c'
206 the first letter of the telephone pad that contains 'c'. For word input, clients need
207 not concern themselves with this encoding. That transformation occurs automatically.
208 <p>
209 However, calls to search_prefix() or contains() must encode the real words with the
210 encoded version. Thus, instead of calling myColl.contains("and"), the client would
211 call myColl.contains("amd"). Method encodeWord() takes a real word and returns the
212 encoded version.
213 <p>
214 Method search_prefix() will usually contain a larger number of 'remaining possible words'
215 than a regular WordCollection. This is because the mapping from encoded to real words is
216 one-to-many.
217 '''
218
219 symbolToEnc = {
220 'ABC' : 'a',
221 'DEF' : 'd',
222 'GHI' : 'g',
223 'JKL' : 'j',
224 'MNO' : 'm',
225 'PQR' : 'p',
226 'STUV': 's',
227 'WXYZ': 'w'
228 }
229
230 encToSymbol = {'a' : 'ABC',
231 'd' : 'DEF',
232 'g' : 'GHI',
233 'j' : 'JKL',
234 'm' : 'MNO',
235 'p' : 'PQR',
236 's' : 'STUV',
237 'w' : 'WXYZ'
238 }
239
240 alphabet = {
241 'a' : 'a',
242 'b' : 'a',
243 'c' : 'a',
244 'd' : 'd',
245 'e' : 'd',
246 'f' : 'd',
247 'g' : 'g',
248 'h' : 'g',
249 'i' : 'g',
250 'j' : 'j',
251 'k' : 'j',
252 'l' : 'j',
253 'm' : 'm',
254 'n' : 'm',
255 'o' : 'm',
256 'p' : 'p',
257 'q' : 'p',
258 'r' : 'p',
259 's' : 's',
260 't' : 's',
261 'u' : 's',
262 'v' : 's',
263 'w' : 'w',
264 'x' : 'w',
265 'y' : 'w',
266 'z' : 'w'
267 }
268
270 '''
271 Maintain a data structure that maps each encoded word
272 to all the possible equivalent real words. We call these
273 multiple words 'collisions.'
274 '''
275 self.encWordToRealWords = {};
276 super(TelPadEncodedWordCollection, self).__init__();
277
279 '''
280 Prefix search operates as for the WordCollection superclass, but takes
281 as input a telephone pad encoded prefix. Returns an array of all real
282 words that could complete the given prefix.
283 @param encWord: the encoded prefix
284 @type encWord: string
285 @raise ValueError: if the mapping from encoded words to collisions is corrupted. Never caused by caller.
286 '''
287 if len(encWord) == 0:
288 return [];
289
290
291 encMatches = super(TelPadEncodedWordCollection, self).prefix_search(encWord);
292
293
294
295 realWordMatches = [];
296 for encWord in encMatches:
297 try:
298 realWordCollisions = self.encWordToRealWords[encWord];
299 except KeyError:
300 raise ValueError("An encoded tel pad word did not have a mapping to at least one real word: %s" % encWord);
301 realWordMatches.extend(realWordCollisions);
302 return realWordMatches;
303
305 '''
306 Given a string label as seen on the JBoard button pad,
307 return the single letter that represents the group of
308 label chars in this class. Ex: "ABC" returns symbolToEnc["ABC"] == 'a'.
309 @param label: Group of chars on a JBoard (more or less telephone pad):
310 @type label: string
311 @raise KeyError: if passed-in button label is not a true button label.
312 '''
313 return TelPadEncodedWordCollection.symbolToEnc[label];
314
316 '''
317 Given the encoding of a button label, return the
318 original label. Ex.: 'a' ==> 'ABC', 's' ==> 'STUV'
319 @param encLetter: label encoding.
320 @type encLetter: string
321 @raise KeyError: if passed-in letter is not an encoded label.
322 '''
323 return TelPadEncodedWordCollection.encToSymbol[encLetter];
324
326 '''
327 Given a real word, return its telephone pad encoded equivalent.
328 @param word: the real word to encode.
329 @type word: string
330 @return: the encoded equivalent string.
331 '''
332 translation = array.array('c', word);
333 for i, char in enumerate(word.lower()):
334 try:
335 translation[i] = TelPadEncodedWordCollection.alphabet[char];
336 except KeyError:
337
338 translation[i] = char;
339 return translation.tostring();
340
341
342 - def insert(self, newRealWord, newRankInt):
343 '''
344 Takes a real, that is unencoded word, encodes it, and
345 inserts it into the (in-memory) tree. Updates the mapping from encoded
346 words to their collisions.
347 @param newRealWord: the unencoded word to insert.
348 @type newRealWord: string
349 @param newRankInt: the new word's frequency rank.
350 @type newRankInt: int
351 @raise ValueError: if the encoded-word to collisions data structure is corrupted. Not caused by caller.
352 '''
353 newEncWord = self.encodeWord(newRealWord);
354 super(TelPadEncodedWordCollection, self).insert(newEncWord);
355 self.realWordToFrequencyRanks[newRealWord] = newRankInt;
356 try:
357 existingCollisions = self.encWordToRealWords[newEncWord];
358 except KeyError:
359 existingCollisions = [];
360
361
362 if len(existingCollisions) == 0:
363 existingCollisions.append(newRealWord);
364 self.encWordToRealWords[newEncWord] = [newRealWord];
365 else:
366 for (pos, realCollisionWord) in enumerate(existingCollisions):
367
368
369 if (realCollisionWord == newRealWord):
370 return;
371
372
373 try:
374 realCollisionWordRank = self.realWordToFrequencyRanks[realCollisionWord];
375 except KeyError:
376 raise ValueError("Data structure that maps telephone-pad-encoded words to lists of true words should contain %s, but does not." % realCollisionWord);
377 if newRankInt <= realCollisionWordRank:
378 existingCollisions.insert(pos, newRealWord);
379 self.encWordToRealWords[newEncWord] = existingCollisions;
380 return;
381
382 existingCollisions.append(newRealWord);
383 self.encWordToRealWords[newEncWord] = existingCollisions;
384
386 '''
387 Given an unencoded word, checks whether the word is already in
388 the in-memory dictionary. If so, does nothing and returns False;
389 Else appends the word to dict_files/dictUserRankAndWord.txt
390 with the provided rank; then returns True
391 @param newRealWord: word to be added to the user dictionary.
392 @type newRealWord: string
393 @param rankInt: frequency rank of the word. Rank 0 is most important; 1 is
394 second-most important, etc. OK to have ties.
395 '''
396
397 newRealWord = newRealWord.encode("UTF-8");
398 newEncWord = self.encodeWord(newRealWord);
399 if self.contains(newEncWord):
400 return False;
401 with open(os.path.realpath(WordCollection.USER_DICT_FILE_PATH), 'a') as fd:
402 fd.write(str(rankInt) + "\t" + newRealWord + "\n");
403
404 self.insert(newRealWord, rankInt);
405 return True;
406
407 if __name__ == "__main__":
408
409 myDict = WordCollection();
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434 myDict.createDictStructureFromFiles();
435 print "'Ne' yields: " + str(myDict.prefix_search('Ne'));
436 print "'New' yields: " + str(myDict.prefix_search('New'));
437
438
439
440
441
442
443
444
445
446