#!/usr/bin/python
"""Score predicted comma placement against a gold-standard text.

Usage: <script> GOLD_FILE TEST_FILE

Both files are read as UTF-8 and are expected to contain the same wording.
The two texts are walked in parallel, ignoring whitespace, quotes, percent
signs, byte-order marks and case, and the commas of the test text are
compared with the commas of the gold text.  The raw counts are printed,
followed by precision, recall and F-score when all three counts are
non-zero.

NOTE: both files are loaded into memory in full.
"""

import sys

verbose = False  # set to True to report each gold comma the test text lacks

verbose_window = 10  # characters of gold context shown left and right

# Characters skipped wherever they occur in the test text.  Sentence
# punctuation is skipped here, so the comma is the only punctuation mark of
# the test text that is ever compared.
_TEST_IGNORED = u'":.-?%\uFEFF'
# Characters skipped wherever they occur in the gold text.
_GOLD_IGNORED = u'"%\uFEFF'
# Gold punctuation that is stepped over without needing a counterpart.
_GOLD_PUNCTUATION = u':.-?'


def _printable(text):
    """Return *text* in a form that ``print`` can emit on any interpreter.

    Python 2 receives UTF-8 encoded bytes, so printing never depends on the
    encoding of stdout; Python 3 receives the text unchanged.
    """
    if str is bytes:  # true on Python 2 only
        return text.encode('utf8')
    return text


def _read_text(path):
    """Read the file at *path* and return its content decoded from UTF-8."""
    # Binary mode plus an explicit decode behaves the same on Python 2 and 3;
    # the ``with`` block guarantees the handle is closed.
    with open(path, 'rb') as handle:
        return handle.read().decode('utf8')


def count_commas(gold, test, verbose=False, window=10):
    """Align *test* with *gold* and count commas.

    Args:
        gold: reference text (unicode string).
        test: text whose commas are being evaluated (unicode string).
        verbose: when true, print the gold context of every gold comma that
            has no comma at the aligned position of the test text.
        window: number of gold characters shown on each side in verbose mode.

    Returns:
        A tuple ``(no_found, no_gold, no_correct)``: commas present in the
        test text, commas present in the gold text, and commas present at
        the same aligned position in both.

    Raises:
        RuntimeError: if the texts differ in something other than the
            ignored characters; the next 100 characters of each text are
            printed first (newlines shown as ``#``).

    The walk stops as soon as either text is exhausted, so anything left
    over in the other text is not counted.
    """
    no_found = no_gold = no_correct = 0
    i = gi = 0  # cursors into test and gold

    while i < len(test) and gi < len(gold):
        t, g = test[i], gold[gi]

        if t.isspace() or t in _TEST_IGNORED:
            i += 1
        elif g.isspace() or g in _GOLD_IGNORED:
            gi += 1
        elif t == u',' or g == u',' or g in _GOLD_PUNCTUATION:
            # t can only equal g here when both are commas, because the
            # other punctuation marks were already skipped in the test text.
            if t == g:
                no_correct += 1
            # A test comma facing a gold period is left in place: the gold
            # cursor steps over the period first, so that "etc.," in gold
            # still lines up with "etc," in test on the next iteration.
            if t == u',' and g != u'.':
                no_found += 1
                i += 1
            if g == u',':
                no_gold += 1
                # Compare the characters captured before the cursors moved;
                # re-reading test[i] here would look at the following
                # character (or past the end of the text).
                if verbose and t != g:
                    start = max(0, gi - window)  # no wrap-around near 0
                    print("missing: " + _printable(gold[start:gi + window]))
                gi += 1
            elif g in _GOLD_PUNCTUATION:
                gi += 1
        elif t.lower() == g.lower():
            i += 1
            gi += 1
        else:
            # The texts diverge: show where, then give up.
            print('')
            print(_printable(test[i:i + 100].replace(u'\n', u'#')))
            print('')
            print(_printable(gold[gi:gi + 100].replace(u'\n', u'#')))
            print('')
            raise RuntimeError('mismatch')

    return no_found, no_gold, no_correct


def percent(n):
    """Format the fraction *n* as a percentage with one decimal place."""
    return ('%.1f %%' % (n*100))


def summary_lines(no_found, no_gold, no_correct):
    """Return the report lines for the given comma counts.

    The three raw counts are always included.  Precision, recall and
    F-score follow after an empty line, but only when all three counts are
    non-zero, which also keeps every division well defined.
    """
    lines = [
        '# found commas: %d' % no_found,
        '# gold commas: %d' % no_gold,
        '# correctly found commas: %d' % no_correct,
    ]
    if no_found and no_gold and no_correct:
        prec = float(no_correct) / no_found
        rec = float(no_correct) / no_gold
        lines.extend([
            '',
            'precision: ' + percent(prec),
            'recall: ' + percent(rec),
            'F: ' + percent(2 * prec * rec / (prec + rec)),
        ])
    return lines


def main(argv=None):
    """Run the evaluation for ``argv[1]`` (gold) and ``argv[2]`` (test)."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else '<script>'
        sys.exit('usage: %s GOLD_FILE TEST_FILE' % prog)

    gold = _read_text(argv[1])
    test = _read_text(argv[2])

    counts = count_commas(gold, test, verbose=verbose, window=verbose_window)
    for line in summary_lines(*counts):
        print(line)


if __name__ == '__main__':
    main()
---|