#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the test documents that contain no entity token seen in training.

Both input files are read as one ``token<TAB>annotation`` row per line, with
blank lines separating documents.  A token counts as a *known entity* when
it appears in the training file with an annotation other than ``O``.  Every
test document that has no non-``O`` row whose token is a known entity is
written to stdout, followed by one empty line.

Usage: get_unknown.py [train_data] [test_data]
"""

import sys
import re


def parse_row(line):
    """Return ``(token, annotation)`` for a data row, or ``None``.

    The line is stripped of surrounding whitespace and split on its first
    tab.  ``None`` is returned when the stripped line has no tab, which
    covers blank lines, whitespace-only lines that merely contain a tab, and
    rows whose token or annotation is missing (e.g. ``"tok\\t"``).
    """
    stripped = line.strip()
    # Test the *stripped* text: a tab that only lives in the leading or
    # trailing whitespace disappears with strip(), and unpacking the split
    # result would then raise ValueError.
    if '\t' not in stripped:
        return None
    token, annotation = stripped.split('\t', 1)
    return token, annotation


def collect_known_entities(lines):
    """Return the set of tokens annotated with anything other than ``O``.

    ``lines`` is any iterable of text lines (an open file works).  Lines
    that are not ``token<TAB>annotation`` rows are ignored.
    """
    known = set()
    for line in lines:
        row = parse_row(line)
        if row is not None and row[1] != 'O':
            known.add(row[0])
    return known


def iter_unknown_documents(lines, known_entities):
    """Yield each document that mentions no known entity.

    ``lines`` is an iterable of text lines in which blank (whitespace-only)
    lines separate documents; ``known_entities`` is a container of tokens.
    Each yielded document is a list of ``(token, annotation)`` tuples.

    A document is withheld when at least one of its rows has an annotation
    other than ``O`` *and* a token found in ``known_entities``.  Every blank
    line closes the current document, so consecutive blank lines yield empty
    documents.  A trailing document that is not followed by a blank line is
    still yielded.
    """
    document = []
    contains_known_ent = False
    for line in lines:
        if not line.strip():
            if not contains_known_ent:
                yield document
            document = []
            contains_known_ent = False
            continue
        row = parse_row(line)
        if row is None:
            # Non-blank line without a usable tab: not a data row, skip it.
            continue
        document.append(row)
        token, annotation = row
        if annotation != 'O' and token in known_entities:
            contains_known_ent = True
    # The input may end without a final blank line; without this flush the
    # last document would be lost.
    if document and not contains_known_ent:
        yield document


def main(argv=None):
    """Run the filter; return the process exit status.

    ``argv`` defaults to ``sys.argv``.  Returns 1 after writing the usage
    message to stderr when fewer than two file arguments are given, and 0
    otherwise.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write("Usage get_unknown.py [train_data] [test_data]\n")
        return 1

    train_data = argv[1]
    test_data = argv[2]

    with open(train_data) as f:
        # A set keeps the per-row membership test constant-time.
        entities = collect_known_entities(f)

    with open(test_data) as f:
        for document in iter_unknown_documents(f, entities):
            for token, annotation in document:
                print('{}\t{}'.format(token, annotation))
            print('')
    return 0


if __name__ == "__main__":
    sys.exit(main())