1
2
import os
import re
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
6
7
8
9
10
11
def pullFirstFromWeb(
        baseUrl="http://www.insightin.com/esl/",
        dictDir="C:/Users/paepcke/dldev/EclipseWorkspaces/JBoard/src/JBoard/dict_files"):
    """Download the ``1000.php`` page and save it as ``dict1000.html``.

    Args:
        baseUrl: URL prefix to which ``1000.php`` is appended.
        dictDir: Directory that receives ``dict1000.html``. It is not
            created here, so it must already exist.

    Raises:
        IOError: If the page cannot be opened or the output file cannot
            be written.
    """
    pageId = 1000

    # closing() guarantees the connection is released even if read() fails.
    with closing(urlopen(baseUrl + str(pageId) + ".php")) as response:
        page = response.read()

    # Binary mode: read() returns the raw payload, which should reach the
    # disk untouched (no newline translation, and bytes are accepted on
    # Python 3 as well as Python 2).
    outFileName = os.path.join(dictDir, "dict" + str(pageId) + ".html")
    with open(outFileName, "wb") as outFile:
        outFile.write(page)

    # The original message used an undefined loop variable ``i`` here.
    print("Did " + str(pageId))
22
def pullRestFromWeb(
        baseUrl="http://www.insightin.com/esl/",
        dictDir="C:/Users/paepcke/dldev/EclipseWorkspaces/JBoard/src/JBoard/dict_files",
        start=2000,
        stop=6100,
        step=100):
    """Download a series of ``<n>.php`` pages and save each as ``dict<n>.html``.

    ``n`` runs over ``range(start, stop, step)``; with the defaults that is
    2000, 2100, ..., 6000.

    Args:
        baseUrl: URL prefix to which ``<n>.php`` is appended.
        dictDir: Directory that receives the ``dict<n>.html`` files. It is
            not created here, so it must already exist.
        start: First page number to fetch.
        stop: Exclusive upper bound of the page numbers.
        step: Increment between consecutive page numbers.

    Raises:
        IOError: If a page cannot be opened or an output file cannot be
            written. Pages saved before the failure are left in place.
    """
    for pageId in range(start, stop, step):
        # closing() guarantees the connection is released even if read()
        # fails part-way through.
        with closing(urlopen(baseUrl + str(pageId) + ".php")) as response:
            page = response.read()

        # Binary mode: write the raw payload exactly as received.
        outFileName = os.path.join(dictDir, "dict" + str(pageId) + ".html")
        with open(outFileName, "wb") as outFile:
            outFile.write(page)

        print("Did " + str(pageId))
34
def pullOutNumbers(
        dictDir="C:/Users/paepcke/dldev/EclipseWorkspaces/JBoard/src/JBoard/dict_files",
        start=2100,
        stop=6100,
        step=100):
    """Extract rank/word pairs from saved pages into tab-separated files.

    For every ``n`` in ``range(start, stop, step)`` this reads
    ``dict<n>.html`` from ``dictDir`` and writes ``dict<n>RankAndWord.txt``
    next to it. Each occurrence of ``word=<text>`` immediately followed on
    the next line by ``&rank=<digits>`` produces one output line holding
    the rank, a tab, and the word.

    Args:
        dictDir: Directory holding the ``dict<n>.html`` inputs; the output
            files are written to the same directory.
        start: First page number to process.
        stop: Exclusive upper bound of the page numbers.
        step: Increment between consecutive page numbers.

    Raises:
        IOError: If an input file is missing or an output file cannot be
            written.
    """
    # NOTE(review): the default start of 2100 skips dict1000.html and
    # dict2000.html -- confirm whether that is intentional.

    # Raw string so that ``\d`` reaches the regex engine without relying on
    # Python passing an unknown string escape through unchanged.
    matcher = re.compile(r'word=([^\n]*)\n&rank=([\d]*)')

    for pageId in range(start, stop, step):
        prefix = os.path.join(dictDir, "dict" + str(pageId))

        with open(prefix + ".html") as fIn:
            page = fIn.read()

        with open(prefix + "RankAndWord.txt", "w") as fOut:
            # group(2) is the rank, group(1) is the word.
            fOut.writelines(
                match.group(2) + "\t" + match.group(1) + "\n"
                for match in matcher.finditer(page))

        print("Done " + str(pageId))
46
if __name__ == "__main__":
    # Run the stages in order: fetch the first page, fetch the remaining
    # pages, then extract the rank/word pairs from the saved files.
    pullFirstFromWeb()
    pullRestFromWeb()
    pullOutNumbers()
51