Context Navigation

NamedEntityRecognition: get_unknown.py

File get_unknown.py, 869 bytes (added by Zuzana Nevěřilová, 6 years ago)

Line
#!/usr/bin/python
# -- coding: utf-8 --
"""Print the test documents that contain no entity seen in the training data.

Both input files are read as one token per line in the form
``token<TAB>annotation``; a blank line ends a document.  A token counts as an
entity when its annotation is anything other than ``'O'``.

Usage: get_unknown.py [train_data] [test_data]
"""

import sys
import re


def load_known_entities(train_path):
    """Return the set of tokens annotated as entities in *train_path*.

    Lines that do not contain a tab-separated ``token<TAB>annotation`` pair
    after stripping surrounding whitespace are ignored.
    """
    known = set()
    with open(train_path) as f:
        for line in f:
            stripped = line.strip()
            # Test the stripped text: strip() can remove a trailing tab, and
            # the split below would then yield a single field.
            if '\t' not in stripped:
                continue
            token, annotation = stripped.split('\t', 1)
            if annotation != 'O':
                known.add(token)
    return known


def iter_unknown_documents(lines, known_entities):
    """Yield each document in *lines* that has no known entity.

    *lines* is any iterable of text lines.  Each yielded document is a list
    of ``(token, annotation)`` tuples.  A document is skipped when at least
    one of its tokens is annotated with something other than ``'O'`` and is
    a member of *known_entities*.

    Every blank line closes the current document, so consecutive blank lines
    yield empty documents.  A trailing document that is not followed by a
    blank line is yielded as well, instead of being lost at end of input.
    """
    document = []
    contains_known_ent = False
    for line in lines:
        stripped = line.strip()
        if not stripped:
            if not contains_known_ent:
                yield document
            document = []
            contains_known_ent = False
            continue
        if '\t' not in stripped:
            continue
        token, annotation = stripped.split('\t', 1)
        document.append((token, annotation))
        if annotation != 'O' and token in known_entities:
            contains_known_ent = True
    # Flush the last document when the input does not end with a blank line.
    if document and not contains_known_ent:
        yield document


def main(argv=None):
    """Run the filter and print the kept documents to standard output.

    *argv* defaults to ``sys.argv``.  Returns the process exit status; the
    usage message keeps the script's historical status of 0.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Usage get_unknown.py [train_data] [test_data]")
        return 0

    train_data = argv[1]
    test_data = argv[2]

    entities = load_known_entities(train_data)

    with open(test_data) as f:
        for document in iter_unknown_documents(f, entities):
            for token, annotation in document:
                print('{}\t{}'.format(token, annotation))
            # Blank line separates documents in the output.
            print('')
    return 0


if __name__ == '__main__':
    sys.exit(main())