Package word_completion :: Module grab6000FrequentWords
[hide private]
[frames | no frames]

Source Code for Module word_completion.grab6000FrequentWords

 1  #!/usr/bin/env python 
 2   
 3  import os 
 4  import re 
 5  import urllib 
 6   
 7  # This is a throw-away module. Used once to pull a 
 8  # dictionary of the highest ranked 6000 English words 
 9  # from search queries. Hard-coded paths. Would need 
10  # generalization if re-used. 
11   
def pullFirst1000FromWeb():
    """Download the page for the first 1000 words and save it to disk.

    Fetches http://www.insightin.com/esl/1000.php and writes the raw page
    to the hard-coded file ``dict1000.html`` in the JBoard ``dict_files``
    directory, then prints a progress message.
    """
    outFileName = "C:/Users/paepcke/dldev/EclipseWorkspaces/JBoard/src/JBoard/dict_files/dict1000.html"
    f = urllib.urlopen("http://www.insightin.com/esl/1000.php")
    try:
        page = f.read()
    finally:
        # Release the connection even if the read fails.
        f.close()
    with open(outFileName, "w") as outFile:
        outFile.write(page)
    # The original printed str(i), but no `i` exists in this function
    # (NameError at runtime); report the fixed page number instead.
    print("Did 1000")
22
def pullRestFromWeb():
    """Download the pages numbered 2000 through 6000 and save each to disk.

    For every i in range(2000, 6100, 100), fetches
    http://www.insightin.com/esl/<i>.php and writes the raw page to the
    hard-coded file ``dict<i>.html`` in the JBoard ``dict_files``
    directory, printing a progress message after each page.
    """
    dictFilePrefix = "C:/Users/paepcke/dldev/EclipseWorkspaces/JBoard/src/JBoard/dict_files/dict"
    for i in range(2000, 6100, 100):
        f = urllib.urlopen("http://www.insightin.com/esl/" + str(i) + ".php")
        try:
            page = f.read()
        finally:
            # Release the connection even if the read fails.
            f.close()
        outFileName = dictFilePrefix + str(i) + ".html"
        # `with` guarantees the file is closed even if the write fails.
        with open(outFileName, "w") as outFile:
            outFile.write(page)
        print("Did " + str(i))
34
def pullOutNumbers(dictDir="C:/Users/paepcke/dldev/EclipseWorkspaces/JBoard/src/JBoard/dict_files",
                   start=2100, stop=6100, step=100):
    """Extract (rank, word) pairs from previously saved dictionary pages.

    For every i in range(start, stop, step), reads ``<dictDir>/dict<i>.html``
    and writes ``<dictDir>/dict<i>RankAndWord.txt`` containing one
    ``<rank>\\t<word>`` line for each occurrence in the page of
    ``word=<text>`` immediately followed by a newline and ``&rank=<digits>``.
    Prints a progress message after each file.

    The defaults reproduce the original hard-coded directory and range.

    NOTE(review): the default range starts at 2100, so dict1000.html and
    dict2000.html are not processed by default; confirm whether that is
    intentional.

    Args:
        dictDir: Directory holding the ``dict<i>.html`` input files; the
            output files are written to the same directory.
        start: First page number to process (inclusive).
        stop: Page number at which to stop (exclusive).
        step: Increment between page numbers.
    """
    # Raw string: same pattern as before, without relying on the invalid
    # string escape '\d' being passed through unchanged.
    matcher = re.compile(r'word=([^\n]*)\n&rank=([\d]*)')
    for i in range(start, stop, step):
        filePrefix = dictDir + "/dict" + str(i)
        with open(filePrefix + ".html") as fIn:
            page = fIn.read()
        with open(filePrefix + "RankAndWord.txt", "w") as fOut:
            for match in matcher.finditer(page):
                # Output order is rank first, then word.
                fOut.write(match.group(2) + "\t" + match.group(1) + "\n")
        print("Done " + str(i))

if __name__ == "__main__":
    # Run the full pipeline: download the first page, download the
    # remaining pages, then extract rank/word pairs from the saved pages.
    # The original called pullFirstFromWeb(), which is not defined in this
    # module (NameError); the function is named pullFirst1000FromWeb.
    pullFirst1000FromWeb()
    pullRestFromWeb()
    pullOutNumbers()