Package word_completion :: Module word_collection
[hide private]
[frames] | no frames]

Source Code for Module word_completion.word_collection

  1  #!/usr/bin/env python 
  2   
  3  import array; 
  4  import os 
  5  import sys 
  6  sys.path.append(os.path.join(os.path.dirname(__file__), "../../lib")); 
  7  from ternarytree import TernarySearchTree; 
  8   
  9  # TODO:  
 10  #  - get ternarytree.so into lib subdir during setup. Make that work for Cygwin as well. 
 11   
 12   
 13  # ABC, DEF, GHI, JKL, MNO, PQR, STUV, WXYZ 
 14  #  A    D    G    J    M    P     S    W 
 15  # 
16 -class WordCollection(TernarySearchTree):
17 ''' 18 Word lookup based on a Patricia tree (a.k.a. Radix Tree, a.k.a. Trie data structure). 19 This data structure is efficiently searchable by the prefix of words. Such a prefix search takes 20 a string prefix, and returns all dictionary words that begin with that prefix. 21 22 This class ingests rank/word pair files in a given directory. The ranks are intended 23 to be relative usage frequencies. The class manages these frequency ranks. 24 25 Public methods: 26 27 - add(word) 28 - contains(word) 29 - prefix_search(prefix) 30 - rank(word) 31 ''' 32 33 DEFAULT_USER_DICT_FILE_NAME = "dictUserRankAndWord.txt"; 34 USER_DICT_FILE_PATH = None; 35
36 - def __init__(self, dictDir=None, userDictFilePath=None):
37 ''' 38 Keep track of a Python dict mapping from word to 39 its frequency rank, of the total number of entries, and 40 the number of word files ingested from disk. 41 @param dictDir: full path to directory that contains the dictionary files. If None, a 42 built-in dictionary of 6000 words is used. 43 @type dictDir: string 44 @param userDictFilePath: full path to within a user dictionary. That file must be organized like 45 the other dictionary files. 46 @type userDictFilePath: string 47 ''' 48 super(WordCollection, self).__init__(); 49 if dictDir is None: 50 self.dictDir = os.path.join(os.path.dirname(__file__), "dict_files"); 51 else: 52 self.dictDir = dictDir; 53 54 if userDictFilePath is None: 55 WordCollection.USER_DICT_FILE_PATH = os.path.join(self.dictDir, WordCollection.DEFAULT_USER_DICT_FILE_NAME) 56 else: 57 WordCollection.USER_DICT_FILE_PATH = userDictFilePath; 58 59 self.realWordToFrequencyRanks = {}; 60 self.numEntries = 0; 61 self.numDictFilesIngested = 0; 62 self.createDictStructureFromFiles();
63
65 ''' 66 Goes through the self.dictDir directory on disk, and reads all the 67 files there. Each file must be a list of whitespace-separated 68 frequency-rank / word pairs. Assumes that self.dictDir is set to directory 69 of dictionary files. 70 @raise ValueError: if a rank in any of the files cannot be read as an integer. 71 ''' 72 for rankAndWordListFileName in os.listdir(self.dictDir): 73 with open(os.path.realpath(os.path.join(self.dictDir, rankAndWordListFileName))) as fd: 74 self.numDictFilesIngested += 1; 75 # Pull the entire rank[\t]word list into memory as one string: 76 rankAndWordLists = fd.read(); 77 for line in rankAndWordLists.splitlines(): 78 if len(line) == 0: 79 continue; 80 # Make one whitespace split to get the rank and the word: 81 try: 82 (rank, word) = line.split(None, 1); 83 except: 84 raise ValueError("Word file file %s contains a line that does not contain a numeric rank, followed by a word: '%s'" % 85 (rankAndWordListFileName, line)); 86 try: 87 rankInt = int(rank); 88 except ValueError: 89 raise ValueError("Word file %s contains a line with a non-numeric rank %s" % 90 (rankAndWordListFileName, rank)); 91 self.insert(word, rankInt);
92
93 - def addToUserDict(self, newWord, rankInt=0):
94 ''' 95 Given a word, checks whether the word is already in 96 the in-memory dictionary. If so, does nothing and returns False; 97 Else appends the word to dict_files/dictUserRankAndWord.txt 98 with the provided rank; then returns True 99 @param newWord: word to be added to the user dictionary. 100 @type newWord: string 101 @param rankInt: frequency rank of the word. Rank 0 is most important; 1 is 102 second-most important, etc. OK to have ties. 103 @type rankInt: int 104 ''' 105 # Ensure that the word is not unicode: 106 newWord = newWord.encode("UTF-8"); 107 if self.contains(newWord): 108 return False; 109 with open(os.path.realpath(WordCollection.USER_DICT_FILE_PATH), 'a') as fd: 110 fd.write(str(rankInt) + "\t" + newWord + "\n"); 111 # Update the current in-memory tree to include the word as well: 112 self.insert(newWord, rankInt); 113 return True;
114 115 116
117 - def insert(self, word, rankInt=None):
118 ''' 119 Insert one word into the word collection. 120 @param word: word to insert. 121 @type word: string 122 @param rankInt: Optionally the frequency rank of the word. If None, no rank is recorded, 123 and subsequent calls to the rank() method will fail. 124 @type rankInt: int 125 @raise ValueError: if word is not valid or empty. 126 ''' 127 if not self.contains(word): 128 self.numEntries += 1; 129 self.add(word); 130 if rankInt is not None: 131 self.realWordToFrequencyRanks[word.encode('UTF-8')] = rankInt;
132
133 - def rank(self, word):
134 ''' 135 Return the frequency rank of the given word in the collection. I is 136 an error to request the rank of a word that is not in the collection, 137 or of a word whose rank was never specified in an ingestion file or 138 as part of an insert() call. 139 @param word: the word whose frequency rank is requested. 140 @type word: string 141 @raise KeyError: if word or rank are not present in the word collection. 142 ''' 143 return self.realWordToFrequencyRanks[word.encode('UTF-8')];
144
145 - def prefix_search(self, word, cutoffRank=None):
146 ''' 147 Returns all dictionary entries that begin with the string word. 148 If the optional cutoffRank is specified, it limits the length of the 149 returned list to include only the top cutoffRank words. Example, if 150 cutoffRank=5, only the five most highly ranked dictionary entries 151 are returned. Also, if cutoffRank is specified, the returned list 152 is sorted by decreasing word rank. If cutoffRank is not specified, 153 or is None, the returned list is unsorted. 154 @param word: prefix to search by. 155 @type word: string. 156 @param cutoffRank: Number of most highly ranked dictionary entries to return in rank-sorted order. 157 @type cutoffRank: int 158 ''' 159 160 if cutoffRank is not None: 161 if not isinstance(cutoffRank, int): 162 raise TypeError("Parameter cutoffRank for prefix_search must be an integer."); 163 164 matchingWords = super(WordCollection, self).prefix_search(word); 165 # The underlying tree traversal implementation seems to return 166 # all the words in the tree that start with the first letter of 'word'. 167 # Keep only the ones that really start with word: 168 finalWords = []; 169 for candidate in matchingWords: 170 if self.startsWith(candidate,word): 171 finalWords.append(candidate); 172 if cutoffRank is not None: 173 # sort by rank: 174 finalWords.sort(key=self.rank); 175 return finalWords[:cutoffRank] 176 return finalWords;
177
178 - def startsWith(self, word, prefix):
179 ''' 180 True if word starts with, or is equal to prefix. Else False. 181 @param word: word to examine. 182 @type word: string. 183 @param prefix: string that word is required to start with for a return of True 184 @type prefix: string 185 ''' 186 if len(prefix) > len(word): 187 return False; 188 wordFoundFrag = word[:len(prefix)]; 189 return word[:len(prefix)] == prefix;
190
191 - def __len__(self):
192 ''' 193 Return number of words in the collection. 194 ''' 195 return self.numEntries;
196 197 # --------------------------------------------- Subclass TelPadEncodedWordCollection ----------------- 198
199 -class TelPadEncodedWordCollection(WordCollection):
200 ''' 201 Instances behave as the superclass WordCollection. However, all added words 202 are encoded as if entered via a telephone pad. Each letter group of the telephone 203 pad is represented by its first letter. Example: "and" --> "amd" (phone buttons 1,5,2). 204 <p> 205 The class can thus be used to search words by entering for each real word's letters 'c' 206 the first letter of the telephone pad that contains 'c'. For word input, clients need 207 not concern themselves with this encoding. That transformation occurs automatically. 208 <p> 209 However, calls to search_prefix() or contains() must encode the real words with the 210 encoded version. Thus, instead of calling myColl.contains("and"), the client would 211 call myColl.contains("amd"). Method encodeWord() takes a real word and returns the 212 encoded version. 213 <p> 214 Method search_prefix() will usually contain a larger number of 'remaining possible words' 215 than a regular WordCollection. This is because the mapping from encoded to real words is 216 one-to-many. 217 ''' 218 219 symbolToEnc = { 220 'ABC' : 'a', 221 'DEF' : 'd', 222 'GHI' : 'g', 223 'JKL' : 'j', 224 'MNO' : 'm', 225 'PQR' : 'p', 226 'STUV': 's', 227 'WXYZ': 'w' 228 } 229 230 encToSymbol = {'a' : 'ABC', 231 'd' : 'DEF', 232 'g' : 'GHI', 233 'j' : 'JKL', 234 'm' : 'MNO', 235 'p' : 'PQR', 236 's' : 'STUV', 237 'w' : 'WXYZ' 238 } 239 240 alphabet = { 241 'a' : 'a', 242 'b' : 'a', 243 'c' : 'a', 244 'd' : 'd', 245 'e' : 'd', 246 'f' : 'd', 247 'g' : 'g', 248 'h' : 'g', 249 'i' : 'g', 250 'j' : 'j', 251 'k' : 'j', 252 'l' : 'j', 253 'm' : 'm', 254 'n' : 'm', 255 'o' : 'm', 256 'p' : 'p', 257 'q' : 'p', 258 'r' : 'p', 259 's' : 's', 260 't' : 's', 261 'u' : 's', 262 'v' : 's', 263 'w' : 'w', 264 'x' : 'w', 265 'y' : 'w', 266 'z' : 'w' 267 } 268
269 - def __init__(self):
270 ''' 271 Maintain a data structure that maps each encoded word 272 to all the possible equivalent real words. We call these 273 multiple words 'collisions.' 274 ''' 275 self.encWordToRealWords = {}; 276 super(TelPadEncodedWordCollection, self).__init__();
277
278 - def prefix_search(self, encWord):
279 ''' 280 Prefix search operates as for the WordCollection superclass, but takes 281 as input a telephone pad encoded prefix. Returns an array of all real 282 words that could complete the given prefix. 283 @param encWord: the encoded prefix 284 @type encWord: string 285 @raise ValueError: if the mapping from encoded words to collisions is corrupted. Never caused by caller. 286 ''' 287 if len(encWord) == 0: 288 return []; 289 # Get the normal Patricia tree matching set, which consists of 290 # encWords: 291 encMatches = super(TelPadEncodedWordCollection, self).prefix_search(encWord); 292 293 # But each encoded word, might match to multiple real words. Build 294 # that larger list: 295 realWordMatches = []; 296 for encWord in encMatches: 297 try: 298 realWordCollisions = self.encWordToRealWords[encWord]; 299 except KeyError: 300 raise ValueError("An encoded tel pad word did not have a mapping to at least one real word: %s" % encWord); 301 realWordMatches.extend(realWordCollisions); 302 return realWordMatches;
303
304 - def encodeTelPadLabel(self, label):
305 ''' 306 Given a string label as seen on the JBoard button pad, 307 return the single letter that represents the group of 308 label chars in this class. Ex: "ABC" returns symbolToEnc["ABC"] == 'a'. 309 @param label: Group of chars on a JBoard (more or less telephone pad): 310 @type label: string 311 @raise KeyError: if passed-in button label is not a true button label. 312 ''' 313 return TelPadEncodedWordCollection.symbolToEnc[label];
314
315 - def decodeTelPadLabel(self, encLetter):
316 ''' 317 Given the encoding of a button label, return the 318 original label. Ex.: 'a' ==> 'ABC', 's' ==> 'STUV' 319 @param encLetter: label encoding. 320 @type encLetter: string 321 @raise KeyError: if passed-in letter is not an encoded label. 322 ''' 323 return TelPadEncodedWordCollection.encToSymbol[encLetter];
324
325 - def encodeWord(self, word):
326 ''' 327 Given a real word, return its telephone pad encoded equivalent. 328 @param word: the real word to encode. 329 @type word: string 330 @return: the encoded equivalent string. 331 ''' 332 translation = array.array('c', word); 333 for i, char in enumerate(word.lower()): 334 try: 335 translation[i] = TelPadEncodedWordCollection.alphabet[char]; 336 except KeyError: 337 # Char is not an alpha char (e.g. digit, or apostrophe): 338 translation[i] = char; 339 return translation.tostring();
340 341
342 - def insert(self, newRealWord, newRankInt):
343 ''' 344 Takes a real, that is unencoded word, encodes it, and 345 inserts it into the (in-memory) tree. Updates the mapping from encoded 346 words to their collisions. 347 @param newRealWord: the unencoded word to insert. 348 @type newRealWord: string 349 @param newRankInt: the new word's frequency rank. 350 @type newRankInt: int 351 @raise ValueError: if the encoded-word to collisions data structure is corrupted. Not caused by caller. 352 ''' 353 newEncWord = self.encodeWord(newRealWord); 354 super(TelPadEncodedWordCollection, self).insert(newEncWord); 355 self.realWordToFrequencyRanks[newRealWord] = newRankInt; 356 try: 357 existingCollisions = self.encWordToRealWords[newEncWord]; 358 except KeyError: 359 existingCollisions = []; 360 # Insert the new real word into the existing list of collisions, 361 # since newEncWord maps to newRealWord. Preserving high-to-low ranking order: 362 if len(existingCollisions) == 0: 363 existingCollisions.append(newRealWord); 364 self.encWordToRealWords[newEncWord] = [newRealWord]; 365 else: 366 for (pos, realCollisionWord) in enumerate(existingCollisions): 367 368 # If this same realWord was inserted before then we are done: 369 if (realCollisionWord == newRealWord): 370 return; 371 372 # Get rank of the existing collision realWord: 373 try: 374 realCollisionWordRank = self.realWordToFrequencyRanks[realCollisionWord]; 375 except KeyError: 376 raise ValueError("Data structure that maps telephone-pad-encoded words to lists of true words should contain %s, but does not." % realCollisionWord); 377 if newRankInt <= realCollisionWordRank: 378 existingCollisions.insert(pos, newRealWord); 379 self.encWordToRealWords[newEncWord] = existingCollisions; 380 return; 381 382 existingCollisions.append(newRealWord); 383 self.encWordToRealWords[newEncWord] = existingCollisions;
384
385 - def addToUserDict(self, newRealWord, rankInt=0):
386 ''' 387 Given an unencoded word, checks whether the word is already in 388 the in-memory dictionary. If so, does nothing and returns False; 389 Else appends the word to dict_files/dictUserRankAndWord.txt 390 with the provided rank; then returns True 391 @param newRealWord: word to be added to the user dictionary. 392 @type newRealWord: string 393 @param rankInt: frequency rank of the word. Rank 0 is most important; 1 is 394 second-most important, etc. OK to have ties. 395 ''' 396 # Ensure that the word is not unicode: 397 newRealWord = newRealWord.encode("UTF-8"); 398 newEncWord = self.encodeWord(newRealWord); 399 if self.contains(newEncWord): 400 return False; 401 with open(os.path.realpath(WordCollection.USER_DICT_FILE_PATH), 'a') as fd: 402 fd.write(str(rankInt) + "\t" + newRealWord + "\n"); 403 # Update the current in-memory tree to include the word as well: 404 self.insert(newRealWord, rankInt); 405 return True;
406 407 if __name__ == "__main__": 408 409 myDict = WordCollection(); 410 411 # # Test word-to-tie-alphabet mapping: 412 # print myDict.encodeWord('andreas'); 413 # 414 # Test underlying Patricia tree facility: 415 # dictTree = TernarySearchTree() 416 # dictTree.add("hello"); 417 # dictTree.add("hell"); 418 # print dictTree.prefix_search("he") 419 # dictTree.add("hillel") 420 # print dictTree.prefix_search("he") # bug: returns hello,hell, and hillel 421 422 # Test WordCollection ingestion from file: 423 # myDict.createDictStructureFromFiles(); 424 # print str(myDict.contains("hello")); 425 # print str(myDict.prefix_search("hell")); 426 # myDict.add("hillel"); 427 # print str(myDict.prefix_search("he")) 428 # print str(myDict.prefix_search("he", cutoffRank=5)); 429 # print str(myDict.prefix_search("hel", cutoffRank=5)); 430 # print "Rank of 'hello': " + str(myDict.rank("hello")); 431 # print "Num entries: " + str(len(myDict)); 432 433 # Test Unicode issues: 434 myDict.createDictStructureFromFiles(); 435 print "'Ne' yields: " + str(myDict.prefix_search('Ne')); 436 print "'New' yields: " + str(myDict.prefix_search('New')); 437 438 439 # # Test tel pad dictionary: 440 # myTelDict = TelPadEncodedWordCollection(); 441 # print myTelDict.prefix_search(myTelDict.encodeWord("hello")); 442 # print myTelDict.prefix_search("dgspjaw"); 443 # print str(len(myTelDict)); 444 # print str(myTelDict.size) 445 # print "Num of files: " + str(myTelDict.numDictFilesIngested); 446