#!/usr/bin/python
"""Score the comma placement of a test text against a gold-standard text.

Usage: <script> GOLD_FILE TEST_FILE

Both files are read as UTF-8 and must contain the same text apart from
whitespace, letter case and the punctuation listed in ``count_commas``.
The script prints the number of commas found in the test text, the number
of commas in the gold text and the number of correctly placed commas,
followed by precision, recall and F-score when all three counts are
non-zero.

NOTE: both files are loaded into memory completely.
"""

import sys

# Set to True to print the surrounding gold text for every gold comma that
# the test text is missing.
verbose = False
verbose_window = 10  # characters of context left and right of the comma


def _printable(text):
    """Return *text* in a form ``print`` emits as UTF-8 on Python 2 and 3."""
    if sys.version_info[0] < 3:
        # Python 2: encode explicitly so output does not depend on the
        # (possibly ASCII) stdout encoding.
        return text.encode('utf8')
    return text


def _read_text(path):
    """Read the file at *path* and return its contents decoded as UTF-8."""
    with open(path, 'rb') as handle:
        return handle.read().decode('utf8')


def count_commas(gold, test, verbose=False, verbose_window=10):
    """Walk *gold* and *test* in parallel and count their commas.

    Ignored while aligning the two texts:
      * in both texts: whitespace, ``"``, ``%``, U+FEFF and the
        punctuation ``: . - ?``;
      * letter case of all remaining characters.

    A test comma that meets a gold ``.`` is not counted immediately: the
    period is skipped first and the comma is then compared with whatever
    follows in the gold text.

    The walk stops as soon as either text is exhausted; commas remaining
    in the other text are not counted.

    Args:
        gold: gold-standard text (unicode string).
        test: text to evaluate (unicode string).
        verbose: if true, print context for each gold comma that has no
            matching test comma.
        verbose_window: number of context characters printed on each side
            of a missing comma.

    Returns:
        A tuple ``(no_found, no_gold, no_correct)``: commas in the test
        text, commas in the gold text, and commas present at the same
        position in both.

    Raises:
        RuntimeError: if the texts differ in anything other than the
            ignored characters. The next 100 characters of both texts are
            printed first (newlines shown as ``#``).
    """
    no_found = no_gold = no_correct = 0
    i = gi = 0
    while i < len(test) and gi < len(gold):
        t, g = test[i], gold[gi]
        if t.isspace() or t in '":.-?%' or t == u'\uFEFF':
            i += 1
        elif g.isspace() or g in '"%' or g == u'\uFEFF':
            gi += 1
        elif t == ',' or g in ',:.-?':
            # Decide on the match *before* advancing either index, so the
            # verbose report below looks at the characters being compared.
            matched = t == g
            if matched:
                no_correct += 1
            if t == ',' and g != '.':
                no_found += 1
                i += 1
            if g == ',':
                no_gold += 1
                if verbose and not matched:
                    # Clamp the start: a negative index would wrap around.
                    start = max(0, gi - verbose_window)
                    context = gold[start:gi + verbose_window]
                    print(_printable(u'missing: ' + context))
                gi += 1
            elif g in ':.-?':
                gi += 1
        elif t.lower() == g.lower():
            i += 1
            gi += 1
        else:
            # The texts diverge: show where, then abort.
            print('')
            print(_printable(test[i:i + 100].replace(u'\n', u'#')))
            print('')
            print(_printable(gold[gi:gi + 100].replace(u'\n', u'#')))
            print('')
            raise RuntimeError('mismatch')
    return no_found, no_gold, no_correct


def percent(n):
    """Format the fraction *n* as a percentage with one decimal place."""
    return '%.1f %%' % (n * 100)


def main(argv=None):
    """Run the evaluation for ``argv[1]`` (gold) and ``argv[2]`` (test)."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit('usage: %s GOLD_FILE TEST_FILE' % argv[0])

    gold = _read_text(argv[1])
    test = _read_text(argv[2])
    no_found, no_gold, no_correct = count_commas(
        gold, test, verbose=verbose, verbose_window=verbose_window)

    print('# found commas: %d' % no_found)
    print('# gold commas: %d' % no_gold)
    print('# correctly found commas: %d' % no_correct)

    # The ratios are only defined (and the F-score non-degenerate) when
    # every count is non-zero.
    if no_found and no_gold and no_correct:
        prec = float(no_correct) / no_found
        rec = float(no_correct) / no_gold
        print('')
        print('precision: %s' % percent(prec))
        print('recall: %s' % percent(rec))
        print('F: %s' % percent(2 * prec * rec / (prec + rec)))


if __name__ == '__main__':
    main()