#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the test documents that mention no entity token seen in training.

Both input files hold one ``token<TAB>annotation`` pair per line, with
documents separated by blank (whitespace-only) lines.  A token counts as an
entity when its annotation is anything other than ``'O'``.

Usage: get_unknown.py [train_data] [test_data]
"""
import sys
import re


def _parse_line(line):
    """Return ``(token, annotation)`` for a data line, or ``None`` otherwise.

    The tab test is made on the stripped text, i.e. on exactly the string
    that gets split.  A line whose only tab is leading or trailing
    (``"token\\t\\n"``) is therefore skipped instead of raising ValueError
    during unpacking.
    """
    stripped = line.strip()
    if '\t' not in stripped:
        return None
    token, annotation = stripped.split('\t', 1)
    return token, annotation


def load_entities(path):
    """Return the set of tokens in *path* whose annotation is not ``'O'``.

    A set is used because the result is only ever queried for membership,
    once per annotated token of the test data.
    """
    known = set()
    with open(path) as handle:
        for raw in handle:
            pair = _parse_line(raw)
            if pair is not None and pair[1] != 'O':
                known.add(pair[0])
    return known


def iter_unknown_documents(lines, entities):
    """Yield each document of *lines* that contains no known entity.

    *lines* is any iterable of text lines and *entities* any container
    supporting ``in``.  A document is a list of ``(token, annotation)``
    pairs.  It is dropped as soon as one of its tokens has a non-``'O'``
    annotation and is present in *entities*.

    Every blank line closes the current document, even an empty one, so
    consecutive blank lines yield empty documents.  A trailing document
    that is not followed by a blank line is yielded as well.
    """
    document = []
    has_known = False
    for raw in lines:
        if not raw.strip():
            if not has_known:
                yield document
            document = []
            has_known = False
            continue
        pair = _parse_line(raw)
        if pair is None:
            continue
        document.append(pair)
        if pair[1] != 'O' and pair[0] in entities:
            has_known = True
    # Flush the final document when the input lacks a terminating blank line.
    if document and not has_known:
        yield document


def main(argv=None):
    """Run the command-line tool and return the process exit status.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the training file and
    ``argv[2]`` the test file.  Each surviving document is written to
    standard output as tab-separated lines followed by one empty line.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Usage get_unknown.py [train_data] [test_data]")
        # Status 0 is kept on the usage path for backward compatibility.
        return 0

    entities = load_entities(argv[1])
    with open(argv[2]) as handle:
        for document in iter_unknown_documents(handle, entities):
            for token, annotation in document:
                print('{}\t{}'.format(token, annotation))
            print('')
    return 0


if __name__ == "__main__":
    sys.exit(main())