1 | """Spelling Corrector. |
---|
2 | |
---|
3 | Copyright 2007 Peter Norvig. |
---|
4 | Open source code under MIT license: http://www.opensource.org/licenses/mit-license.php |
---|
5 | """ |
---|
6 | |
---|
7 | import re, collections |
---|
8 | |
---|
9 | def words(text): return re.findall('[a-z]+', text.lower()) |
---|
10 | |
---|
11 | def train(features): |
---|
12 | model = collections.defaultdict(lambda: 1) |
---|
13 | for f in features: |
---|
14 | model[f] += 1 |
---|
15 | return model |
---|
16 | |
---|
17 | NWORDS = train(words(file('big.txt').read())) |
---|
18 | |
---|
19 | alphabet = 'abcdefghijklmnopqrstuvwxyz' |
---|
20 | |
---|
21 | def edits1(word): |
---|
22 | s = [(word[:i], word[i:]) for i in range(len(word) + 1)] |
---|
23 | deletes = [a + b[1:] for a, b in s if b] |
---|
24 | transposes = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1] |
---|
25 | replaces = [a + c + b[1:] for a, b in s for c in alphabet if b] |
---|
26 | inserts = [a + c + b for a, b in s for c in alphabet] |
---|
27 | return set(deletes + transposes + replaces + inserts) |
---|
28 | |
---|
29 | def known_edits2(word): |
---|
30 | return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS) |
---|
31 | |
---|
32 | def known(words): return set(w for w in words if w in NWORDS) |
---|
33 | |
---|
34 | def correct(word): |
---|
35 | candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word] |
---|
36 | return max(candidates, key=NWORDS.get) |
---|
37 | |
---|
38 | ################ Testing code from here on ################ |
---|
39 | |
---|
40 | def spelltest(tests, bias=None, verbose=False): |
---|
41 | import time |
---|
42 | n, bad, unknown, start = 0, 0, 0, time.clock() |
---|
43 | if bias: |
---|
44 | for target in tests: NWORDS[target] += bias |
---|
45 | for target,wrongs in tests.items(): |
---|
46 | for wrong in wrongs.split(): |
---|
47 | n += 1 |
---|
48 | w = correct(wrong) |
---|
49 | if w!=target: |
---|
50 | bad += 1 |
---|
51 | unknown += (target not in NWORDS) |
---|
52 | if verbose: |
---|
53 | print 'correct(%r) => %r (%d); expected %r (%d)' % ( |
---|
54 | wrong, w, NWORDS[w], target, NWORDS[target]) |
---|
55 | return dict(bad=bad, n=n, bias=bias, pct=int(100. - 100.*bad/n), |
---|
56 | unknown=unknown, secs=int(time.clock()-start) ) |
---|
57 | |
---|
58 | tests1 = { 'access': 'acess', 'accessing': 'accesing', 'accommodation': |
---|
59 | 'accomodation acommodation acomodation', 'account': 'acount', 'address': |
---|
60 | 'adress adres', 'addressable': 'addresable', 'arranged': 'aranged arrainged', |
---|
61 | 'arrangeing': 'aranging', 'arrangement': 'arragment', 'articles': 'articals', |
---|
62 | 'aunt': 'annt anut arnt', 'auxiliary': 'auxillary', 'available': 'avaible', |
---|
63 | 'awful': 'awfall afful', 'basically': 'basicaly', 'beginning': 'begining', |
---|
64 | 'benefit': 'benifit', 'benefits': 'benifits', 'between': 'beetween', 'bicycle': |
---|
65 | 'bicycal bycicle bycycle', 'biscuits': |
---|
66 | 'biscits biscutes biscuts bisquits buiscits buiscuts', 'built': 'biult', |
---|
67 | 'cake': 'cak', 'career': 'carrer', |
---|
68 | 'cemetery': 'cemetary semetary', 'centrally': 'centraly', 'certain': 'cirtain', |
---|
69 | 'challenges': 'chalenges chalenges', 'chapter': 'chaper chaphter chaptur', |
---|
70 | 'choice': 'choise', 'choosing': 'chosing', 'clerical': 'clearical', |
---|
71 | 'committee': 'comittee', 'compare': 'compair', 'completely': 'completly', |
---|
72 | 'consider': 'concider', 'considerable': 'conciderable', 'contented': |
---|
73 | 'contenpted contende contended contentid', 'curtains': |
---|
74 | 'cartains certans courtens cuaritains curtans curtians curtions', 'decide': 'descide', 'decided': |
---|
75 | 'descided', 'definitely': 'definately difinately', 'definition': 'defenition', |
---|
76 | 'definitions': 'defenitions', 'description': 'discription', 'desiccate': |
---|
77 | 'desicate dessicate dessiccate', 'diagrammatically': 'diagrammaticaally', |
---|
78 | 'different': 'diffrent', 'driven': 'dirven', 'ecstasy': 'exstacy ecstacy', |
---|
79 | 'embarrass': 'embaras embarass', 'establishing': 'astablishing establising', |
---|
80 | 'experience': 'experance experiance', 'experiences': 'experances', 'extended': |
---|
81 | 'extented', 'extremely': 'extreamly', 'fails': 'failes', 'families': 'familes', |
---|
82 | 'february': 'febuary', 'further': 'futher', 'gallery': 'galery gallary gallerry gallrey', |
---|
83 | 'hierarchal': 'hierachial', 'hierarchy': 'hierchy', 'inconvenient': |
---|
84 | 'inconvienient inconvient inconvinient', 'independent': 'independant independant', |
---|
85 | 'initial': 'intial', 'initials': 'inetials inistals initails initals intials', |
---|
86 | 'juice': 'guic juce jucie juise juse', 'latest': 'lates latets latiest latist', |
---|
87 | 'laugh': 'lagh lauf laught lugh', 'level': 'leval', |
---|
88 | 'levels': 'levals', 'liaison': 'liaision liason', 'lieu': 'liew', 'literature': |
---|
89 | 'litriture', 'loans': 'lones', 'locally': 'localy', 'magnificent': |
---|
90 | 'magnificnet magificent magnifcent magnifecent magnifiscant magnifisent magnificant', |
---|
91 | 'management': 'managment', 'meant': 'ment', 'minuscule': 'miniscule', |
---|
92 | 'minutes': 'muinets', 'monitoring': 'monitering', 'necessary': |
---|
93 | 'neccesary necesary neccesary necassary necassery neccasary', 'occurrence': |
---|
94 | 'occurence occurence', 'often': 'ofen offen offten ofton', 'opposite': |
---|
95 | 'opisite oppasite oppesite oppisit oppisite opposit oppossite oppossitte', 'parallel': |
---|
96 | 'paralel paralell parrallel parralell parrallell', 'particular': 'particulaur', |
---|
97 | 'perhaps': 'perhapse', 'personnel': 'personnell', 'planned': 'planed', 'poem': |
---|
98 | 'poame', 'poems': 'poims pomes', 'poetry': 'poartry poertry poetre poety powetry', |
---|
99 | 'position': 'possition', 'possible': 'possable', 'pretend': |
---|
100 | 'pertend protend prtend pritend', 'problem': 'problam proble promblem proplen', |
---|
101 | 'pronunciation': 'pronounciation', 'purple': 'perple perpul poarple', |
---|
102 | 'questionnaire': 'questionaire', 'really': 'realy relley relly', 'receipt': |
---|
103 | 'receit receite reciet recipt', 'receive': 'recieve', 'refreshment': |
---|
104 | 'reafreshment refreshmant refresment refressmunt', 'remember': 'rember remeber rememmer rermember', |
---|
105 | 'remind': 'remine remined', 'scarcely': 'scarcly scarecly scarely scarsely', |
---|
106 | 'scissors': 'scisors sissors', 'separate': 'seperate', |
---|
107 | 'singular': 'singulaur', 'someone': 'somone', 'sources': 'sorces', 'southern': |
---|
108 | 'southen', 'special': 'speaical specail specal speical', 'splendid': |
---|
109 | 'spledid splended splened splended', 'standardizing': 'stanerdizing', 'stomach': |
---|
110 | 'stomac stomache stomec stumache', 'supersede': 'supercede superceed', 'there': 'ther', |
---|
111 | 'totally': 'totaly', 'transferred': 'transfred', 'transportability': |
---|
112 | 'transportibility', 'triangular': 'triangulaur', 'understand': 'undersand undistand', |
---|
113 | 'unexpected': 'unexpcted unexpeted unexspected', 'unfortunately': |
---|
114 | 'unfortunatly', 'unique': 'uneque', 'useful': 'usefull', 'valuable': 'valubale valuble', |
---|
115 | 'variable': 'varable', 'variant': 'vairiant', 'various': 'vairious', |
---|
116 | 'visited': 'fisited viseted vistid vistied', 'visitors': 'vistors', |
---|
117 | 'voluntary': 'volantry', 'voting': 'voteing', 'wanted': 'wantid wonted', |
---|
118 | 'whether': 'wether', 'wrote': 'rote wote'} |
---|
119 | |
---|
120 | tests2 = {'forbidden': 'forbiden', 'decisions': 'deciscions descisions', |
---|
121 | 'supposedly': 'supposidly', 'embellishing': 'embelishing', 'technique': |
---|
122 | 'tecnique', 'permanently': 'perminantly', 'confirmation': 'confermation', |
---|
123 | 'appointment': 'appoitment', 'progression': 'progresion', 'accompanying': |
---|
124 | 'acompaning', 'applicable': 'aplicable', 'regained': 'regined', 'guidelines': |
---|
125 | 'guidlines', 'surrounding': 'serounding', 'titles': 'tittles', 'unavailable': |
---|
126 | 'unavailble', 'advantageous': 'advantageos', 'brief': 'brif', 'appeal': |
---|
127 | 'apeal', 'consisting': 'consisiting', 'clerk': 'cleark clerck', 'component': |
---|
128 | 'componant', 'favourable': 'faverable', 'separation': 'seperation', 'search': |
---|
129 | 'serch', 'receive': 'recieve', 'employees': 'emploies', 'prior': 'piror', |
---|
130 | 'resulting': 'reulting', 'suggestion': 'sugestion', 'opinion': 'oppinion', |
---|
131 | 'cancellation': 'cancelation', 'criticism': 'citisum', 'useful': 'usful', |
---|
132 | 'humour': 'humor', 'anomalies': 'anomolies', 'would': 'whould', 'doubt': |
---|
133 | 'doupt', 'examination': 'eximination', 'therefore': 'therefoe', 'recommend': |
---|
134 | 'recomend', 'separated': 'seperated', 'successful': 'sucssuful succesful', |
---|
135 | 'apparent': 'apparant', 'occurred': 'occureed', 'particular': 'paerticulaur', |
---|
136 | 'pivoting': 'pivting', 'announcing': 'anouncing', 'challenge': 'chalange', |
---|
137 | 'arrangements': 'araingements', 'proportions': 'proprtions', 'organized': |
---|
138 | 'oranised', 'accept': 'acept', 'dependence': 'dependance', 'unequalled': |
---|
139 | 'unequaled', 'numbers': 'numbuers', 'sense': 'sence', 'conversely': |
---|
140 | 'conversly', 'provide': 'provid', 'arrangement': 'arrangment', |
---|
141 | 'responsibilities': 'responsiblities', 'fourth': 'forth', 'ordinary': |
---|
142 | 'ordenary', 'description': 'desription descvription desacription', |
---|
143 | 'inconceivable': 'inconcievable', 'data': 'dsata', 'register': 'rgister', |
---|
144 | 'supervision': 'supervison', 'encompassing': 'encompasing', 'negligible': |
---|
145 | 'negligable', 'allow': 'alow', 'operations': 'operatins', 'executed': |
---|
146 | 'executted', 'interpretation': 'interpritation', 'hierarchy': 'heiarky', |
---|
147 | 'indeed': 'indead', 'years': 'yesars', 'through': 'throut', 'committee': |
---|
148 | 'committe', 'inquiries': 'equiries', 'before': 'befor', 'continued': |
---|
149 | 'contuned', 'permanent': 'perminant', 'choose': 'chose', 'virtually': |
---|
150 | 'vertually', 'correspondence': 'correspondance', 'eventually': 'eventully', |
---|
151 | 'lonely': 'lonley', 'profession': 'preffeson', 'they': 'thay', 'now': 'noe', |
---|
152 | 'desperately': 'despratly', 'university': 'unversity', 'adjournment': |
---|
153 | 'adjurnment', 'possibilities': 'possablities', 'stopped': 'stoped', 'mean': |
---|
154 | 'meen', 'weighted': 'wagted', 'adequately': 'adequattly', 'shown': 'hown', |
---|
155 | 'matrix': 'matriiix', 'profit': 'proffit', 'encourage': 'encorage', 'collate': |
---|
156 | 'colate', 'disaggregate': 'disaggreagte disaggreaget', 'receiving': |
---|
157 | 'recieving reciving', 'proviso': 'provisoe', 'umbrella': 'umberalla', 'approached': |
---|
158 | 'aproached', 'pleasant': 'plesent', 'difficulty': 'dificulty', 'appointments': |
---|
159 | 'apointments', 'base': 'basse', 'conditioning': 'conditining', 'earliest': |
---|
160 | 'earlyest', 'beginning': 'begining', 'universally': 'universaly', |
---|
161 | 'unresolved': 'unresloved', 'length': 'lengh', 'exponentially': |
---|
162 | 'exponentualy', 'utilized': 'utalised', 'set': 'et', 'surveys': 'servays', |
---|
163 | 'families': 'familys', 'system': 'sysem', 'approximately': 'aproximatly', |
---|
164 | 'their': 'ther', 'scheme': 'scheem', 'speaking': 'speeking', 'repetitive': |
---|
165 | 'repetative', 'inefficient': 'ineffiect', 'geneva': 'geniva', 'exactly': |
---|
166 | 'exsactly', 'immediate': 'imediate', 'appreciation': 'apreciation', 'luckily': |
---|
167 | 'luckeley', 'eliminated': 'elimiated', 'believe': 'belive', 'appreciated': |
---|
168 | 'apreciated', 'readjusted': 'reajusted', 'were': 'wer where', 'feeling': |
---|
169 | 'fealing', 'and': 'anf', 'false': 'faulse', 'seen': 'seeen', 'interrogating': |
---|
170 | 'interogationg', 'academically': 'academicly', 'relatively': 'relativly relitivly', |
---|
171 | 'traditionally': 'traditionaly', 'studying': 'studing', |
---|
172 | 'majority': 'majorty', 'build': 'biuld', 'aggravating': 'agravating', |
---|
173 | 'transactions': 'trasactions', 'arguing': 'aurguing', 'sheets': 'sheertes', |
---|
174 | 'successive': 'sucsesive sucessive', 'segment': 'segemnt', 'especially': |
---|
175 | 'especaily', 'later': 'latter', 'senior': 'sienior', 'dragged': 'draged', |
---|
176 | 'atmosphere': 'atmospher', 'drastically': 'drasticaly', 'particularly': |
---|
177 | 'particulary', 'visitor': 'vistor', 'session': 'sesion', 'continually': |
---|
178 | 'contually', 'availability': 'avaiblity', 'busy': 'buisy', 'parameters': |
---|
179 | 'perametres', 'surroundings': 'suroundings seroundings', 'employed': |
---|
180 | 'emploied', 'adequate': 'adiquate', 'handle': 'handel', 'means': 'meens', |
---|
181 | 'familiar': 'familer', 'between': 'beeteen', 'overall': 'overal', 'timing': |
---|
182 | 'timeing', 'committees': 'comittees commitees', 'queries': 'quies', |
---|
183 | 'econometric': 'economtric', 'erroneous': 'errounous', 'decides': 'descides', |
---|
184 | 'reference': 'refereence refference', 'intelligence': 'inteligence', |
---|
185 | 'edition': 'ediion ediition', 'are': 'arte', 'apologies': 'appologies', |
---|
186 | 'thermawear': 'thermawere thermawhere', 'techniques': 'tecniques', |
---|
187 | 'voluntary': 'volantary', 'subsequent': 'subsequant subsiquent', 'currently': |
---|
188 | 'curruntly', 'forecast': 'forcast', 'weapons': 'wepons', 'routine': 'rouint', |
---|
189 | 'neither': 'niether', 'approach': 'aproach', 'available': 'availble', |
---|
190 | 'recently': 'reciently', 'ability': 'ablity', 'nature': 'natior', |
---|
191 | 'commercial': 'comersial', 'agencies': 'agences', 'however': 'howeverr', |
---|
192 | 'suggested': 'sugested', 'career': 'carear', 'many': 'mony', 'annual': |
---|
193 | 'anual', 'according': 'acording', 'receives': 'recives recieves', |
---|
194 | 'interesting': 'intresting', 'expense': 'expence', 'relevant': |
---|
195 | 'relavent relevaant', 'table': 'tasble', 'throughout': 'throuout', 'conference': |
---|
196 | 'conferance', 'sensible': 'sensable', 'described': 'discribed describd', |
---|
197 | 'union': 'unioun', 'interest': 'intrest', 'flexible': 'flexable', 'refered': |
---|
198 | 'reffered', 'controlled': 'controled', 'sufficient': 'suficient', |
---|
199 | 'dissension': 'desention', 'adaptable': 'adabtable', 'representative': |
---|
200 | 'representitive', 'irrelevant': 'irrelavent', 'unnecessarily': 'unessasarily', |
---|
201 | 'applied': 'upplied', 'apologised': 'appologised', 'these': 'thees thess', |
---|
202 | 'choices': 'choises', 'will': 'wil', 'procedure': 'proceduer', 'shortened': |
---|
203 | 'shortend', 'manually': 'manualy', 'disappointing': 'dissapoiting', |
---|
204 | 'excessively': 'exessively', 'comments': 'coments', 'containing': 'containg', |
---|
205 | 'develop': 'develope', 'credit': 'creadit', 'government': 'goverment', |
---|
206 | 'acquaintances': 'aquantences', 'orientated': 'orentated', 'widely': 'widly', |
---|
207 | 'advise': 'advice', 'difficult': 'dificult', 'investigated': 'investegated', |
---|
208 | 'bonus': 'bonas', 'conceived': 'concieved', 'nationally': 'nationaly', |
---|
209 | 'compared': 'comppared compased', 'moving': 'moveing', 'necessity': |
---|
210 | 'nessesity', 'opportunity': 'oppertunity oppotunity opperttunity', 'thoughts': |
---|
211 | 'thorts', 'equalled': 'equaled', 'variety': 'variatry', 'analysis': |
---|
212 | 'analiss analsis analisis', 'patterns': 'pattarns', 'qualities': 'quaties', 'easily': |
---|
213 | 'easyly', 'organization': 'oranisation oragnisation', 'the': 'thw hte thi', |
---|
214 | 'corporate': 'corparate', 'composed': 'compossed', 'enormously': 'enomosly', |
---|
215 | 'financially': 'financialy', 'functionally': 'functionaly', 'discipline': |
---|
216 | 'disiplin', 'announcement': 'anouncement', 'progresses': 'progressess', |
---|
217 | 'except': 'excxept', 'recommending': 'recomending', 'mathematically': |
---|
218 | 'mathematicaly', 'source': 'sorce', 'combine': 'comibine', 'input': 'inut', |
---|
219 | 'careers': 'currers carrers', 'resolved': 'resoved', 'demands': 'diemands', |
---|
220 | 'unequivocally': 'unequivocaly', 'suffering': 'suufering', 'immediately': |
---|
221 | 'imidatly imediatly', 'accepted': 'acepted', 'projects': 'projeccts', |
---|
222 | 'necessary': 'necasery nessasary nessisary neccassary', 'journalism': |
---|
223 | 'journaism', 'unnecessary': 'unessessay', 'night': 'nite', 'output': |
---|
224 | 'oputput', 'security': 'seurity', 'essential': 'esential', 'beneficial': |
---|
225 | 'benificial benficial', 'explaining': 'explaning', 'supplementary': |
---|
226 | 'suplementary', 'questionnaire': 'questionare', 'employment': 'empolyment', |
---|
227 | 'proceeding': 'proceding', 'decision': 'descisions descision', 'per': 'pere', |
---|
228 | 'discretion': 'discresion', 'reaching': 'reching', 'analysed': 'analised', |
---|
229 | 'expansion': 'expanion', 'although': 'athough', 'subtract': 'subtrcat', |
---|
230 | 'analysing': 'aalysing', 'comparison': 'comparrison', 'months': 'monthes', |
---|
231 | 'hierarchal': 'hierachial', 'misleading': 'missleading', 'commit': 'comit', |
---|
232 | 'auguments': 'aurgument', 'within': 'withing', 'obtaining': 'optaning', |
---|
233 | 'accounts': 'acounts', 'primarily': 'pimarily', 'operator': 'opertor', |
---|
234 | 'accumulated': 'acumulated', 'extremely': 'extreemly', 'there': 'thear', |
---|
235 | 'summarys': 'sumarys', 'analyse': 'analiss', 'understandable': |
---|
236 | 'understadable', 'safeguard': 'safegaurd', 'consist': 'consisit', |
---|
237 | 'declarations': 'declaratrions', 'minutes': 'muinutes muiuets', 'associated': |
---|
238 | 'assosiated', 'accessibility': 'accessability', 'examine': 'examin', |
---|
239 | 'surveying': 'servaying', 'politics': 'polatics', 'annoying': 'anoying', |
---|
240 | 'again': 'agiin', 'assessing': 'accesing', 'ideally': 'idealy', 'scrutinized': |
---|
241 | 'scrutiniesed', 'simular': 'similar', 'personnel': 'personel', 'whereas': |
---|
242 | 'wheras', 'when': 'whn', 'geographically': 'goegraphicaly', 'gaining': |
---|
243 | 'ganing', 'requested': 'rquested', 'separate': 'seporate', 'students': |
---|
244 | 'studens', 'prepared': 'prepaired', 'generated': 'generataed', 'graphically': |
---|
245 | 'graphicaly', 'suited': 'suted', 'variable': 'varible vaiable', 'building': |
---|
246 | 'biulding', 'required': 'reequired', 'necessitates': 'nessisitates', |
---|
247 | 'together': 'togehter', 'profits': 'proffits'} |
---|
248 | |
---|
249 | if __name__ == '__main__': |
---|
250 | print spelltest(tests1) |
---|
251 | |
---|
252 | |
---|
253 | |
---|