Context Navigation

NamedEntityRecognition: convert_cnec_stanford.py

File convert_cnec_stanford.py, 1.3 KB (added by Zuzana Nevěřilová, 6 years ago)

Line
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Convert a CNEC XML corpus into token-per-line, tab-separated NER data.

Usage: convert_cnec_stanford.py [corpus_in_cnec_xml]

Every input line is split into markup tags and whitespace-separated tokens.
Each token is written to standard output as ``token<TAB>label``; the rows of
one input line are followed by an empty line.
"""

import io
import re
import sys

# Maps the value of the ``type`` attribute of an outermost <ne> element to the
# emitted label.  Types not listed here are emitted as "OTHER".
ENT = {
    "P": "PER", "pp": "PER", "p_": "PER", "pf": "PER", "ps": "PER",
    "pb": "PER", "pm": "PER", "pc": "PER", "pd": "PER",
    "ia": "ORG", "if": "ORG", "io": "ORG", "ic": "ORG", "i_": "ORG",
    "G": "LOC", "gc": "LOC", "gh": "LOC", "gr": "LOC", "gq": "LOC",
    "gl": "LOC", "gu": "LOC", "gt": "LOC", "gs": "LOC", "g_": "LOC",
    "A": "LOC", "a_": "LOC", "ah": "LOC", "az": "LOC",
    "lower": "O", "cap": "O",
}

ne_type_re = re.compile(r'<ne type="([^"]*)">', re.U)
# Split on a complete markup tag OR a single whitespace character.  Both
# alternatives are capturing groups, so re.split() keeps the tags in its
# result and yields None for the group that did not take part in a match.
markup_split_re = re.compile(r'(<[^>]*>)|(\s)', re.U)


def convert_line(line):
    """Return the ``token<TAB>label`` rows for one line of CNEC XML.

    Args:
        line: One line of the corpus; surrounding whitespace is ignored.

    Returns:
        A string with one ``token\\tlabel\\n`` row per token, or an empty
        string when the line contains no tokens.  Tokens outside any <ne>
        element are labelled "O"; tokens inside take the label of the
        outermost enclosing <ne> element (nested elements do not change it).
    """
    rows = []
    depth = 0          # number of currently open <ne> elements
    label = "O"
    for token in markup_split_re.split(line.strip()):
        # Skip None (unused regex group), empty strings and pure whitespace.
        if not token or not token.strip():
            continue
        opening = ne_type_re.search(token)
        if opening:
            depth += 1
            if depth == 1:
                label = ENT.get(opening.group(1), "OTHER")
        elif token == "</ne>":
            # Clamp at zero so a stray closing tag cannot make the depth
            # negative and leave the next entity unlabelled.
            depth = max(depth - 1, 0)
            if depth == 0:
                label = "O"
        elif not (token.startswith("<") and token.endswith(">")):
            # Anything that is not a complete tag is a text token.
            rows.append(u"{}\t{}\n".format(token, label))
    return u"".join(rows)


def main(argv=None):
    """Convert the corpus named on the command line and print the result.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 when the input file argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write(
            "Usage convert_cnec_stanford.py [corpus_in_cnec_xml]\n")
        return 1

    # NOTE(review): the corpus is assumed to be UTF-8 encoded - confirm.
    with io.open(argv[1], encoding="utf-8") as corpus:
        for line in corpus:
            # print() adds a newline after the rows, which leaves an empty
            # line after the output of every input line.
            print(convert_line(line))
    return 0


if __name__ == "__main__":
    sys.exit(main())