Context Navigation

Back to en/AdvancedNlpCourse2018/AutomaticCorrection

AutomaticCorrection: evalpunct_robust.py

File evalpunct_robust.py, 1.8 KB (added by Ales Horak, 5 years ago)

Line
#!/usr/bin/python

# Compare the commas of a test text with those of a gold text and print the
# number of found, gold and correctly found commas.
#
# Usage: evalpunct_robust.py GOLD_FILE TEST_FILE
#
# WARNING: loads both files into memory.
#
# Only single-argument print() calls are used so that the script produces the
# same output on Python 2 and Python 3.

import io
import sys

verbose = False  # set to True to print the context of gold commas the test text lacks

verbose_window = 10  # characters left+right


def _read_text(path):
    """Return the whole content of the UTF-8 encoded file *path* as text."""
    # newline='' switches newline translation off, so the text is exactly the
    # decoded file content; the context manager closes the file.
    with io.open(path, encoding='utf8', newline='') as handle:
        return handle.read()


def _preview(text, start):
    """Return up to 100 characters of *text* from *start*, newlines shown as '#'."""
    snippet = text[start:start + 100].replace(u'\n', u'#')
    if sys.version_info[0] < 3:
        # Python 2: print UTF-8 bytes so that redirected output does not fail
        # on non-ASCII characters.
        snippet = snippet.encode('utf8')
    return snippet


def count_commas(gold, test, verbose=False, verbose_window=10):
    """Walk *gold* and *test* in parallel and count their commas.

    Whitespace, the byte order mark and the characters ``":.-?%`` are skipped
    in *test*; whitespace, the byte order mark and ``"%`` are skipped in
    *gold*, and ``:.-?`` in *gold* are stepped over without being counted.
    All other characters must agree case-insensitively.

    The walk stops as soon as either text is exhausted; whatever remains of
    the other text is not examined.

    Args:
        gold: reference text.
        test: text whose commas are evaluated.
        verbose: when true, print the gold context of every gold comma that
            has no comma at the same position in *test*.
        verbose_window: number of context characters printed on each side.

    Returns:
        A tuple ``(no_found, no_gold, no_correct)``: commas seen in *test*,
        commas seen in *gold*, and commas present at the same position in
        both.

    Raises:
        RuntimeError: when two non-punctuation characters differ; the next
            100 characters of both texts are printed first.
    """
    # NOTE(review): the nesting below was reconstructed from a listing whose
    # indentation had been lost -- confirm against the original file.
    no_found = no_gold = no_correct = 0
    i = gi = 0

    while i < len(test) and gi < len(gold):
        # Snapshot both characters before any index moves, so that every
        # comparison in this iteration refers to the same pair.
        tc, gc = test[i], gold[gi]

        if tc.isspace() or tc in u'":.-?%' or tc == u'\uFEFF':
            i += 1
        elif gc.isspace() or gc in u'"%' or gc == u'\uFEFF':
            gi += 1
        elif tc == u',' or gc in u',:.-?':
            if tc == gc:
                no_correct += 1
            if tc == u',' and gc != u'.':
                # A test comma facing a gold full stop is left in place; it
                # is counted once the full stop has been stepped over.
                no_found += 1
                i += 1
            if gc == u',':
                no_gold += 1
                if verbose and tc != gc:
                    # Clamp the start: a negative index would count from the
                    # end of the text and yield a wrong or empty context.
                    start = max(0, gi - verbose_window)
                    print("missing: " + gold[start:gi + verbose_window])
                gi += 1
            elif gc in u':.-?':
                gi += 1
        elif tc.lower() == gc.lower():
            # Ordinary characters must be the same.
            i += 1
            gi += 1
        else:
            print('')
            print(_preview(test, i))
            print('')
            print(_preview(gold, gi))
            print('')
            raise RuntimeError('mismatch')

    return no_found, no_gold, no_correct


if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit('usage: %s GOLD_FILE TEST_FILE' % sys.argv[0])

    gold = _read_text(sys.argv[1])
    test = _read_text(sys.argv[2])

    no_found, no_gold, no_correct = count_commas(
        gold, test, verbose, verbose_window)

    print('# found commas: %d' % no_found)
    print('# gold commas: %d' % no_gold)
    print('# correctly found commas: %d' % no_correct)

def percent(n):
    """Return the fraction *n* as a percentage string with one decimal.

    The value is multiplied by 100 and followed by a space and a percent
    sign, e.g. ``percent(0.5)`` gives ``'50.0 %'``.
    """
    return '%.1f %%' % (n * 100)

def precision_recall_f(no_found, no_gold, no_correct):
    """Return ``(precision, recall, F)`` computed from the three comma counts.

    Precision is ``no_correct / no_found``, recall is ``no_correct / no_gold``
    and F is their harmonic mean ``2 * P * R / (P + R)``.

    Returns None when any of the three counts is zero; this also keeps every
    division away from a zero denominator.
    """
    if not (no_found and no_gold and no_correct):
        return None
    prec = float(no_correct) / no_found
    rec = float(no_correct) / no_gold
    return prec, rec, 2 * prec * rec / (prec + rec)


if __name__ == '__main__':
    scores = precision_recall_f(no_found, no_gold, no_correct)
    # Nothing is printed when a count is zero.
    if scores is not None:
        prec, rec, f_score = scores
        # Single-argument print() calls give the same output on Python 2
        # and Python 3.
        print('')
        print('precision: ' + percent(prec))
        print('recall: ' + percent(rec))
        print('F: ' + percent(f_score))

Context Navigation

en/AdvancedNlpCourse2018/AutomaticCorrection: evalpunct_robust.py

Download in other formats: