#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Convert a CNEC XML corpus into tab-separated token/label lines.

Each input line is split into markup tags and whitespace-delimited tokens.
Every token is written as ``token<TAB>label`` on its own line, where the
label comes from the outermost enclosing ``<ne type="...">`` element (see
``ENT``); tokens outside any ``<ne>`` element get the label ``O``.  The
rows of one input line are followed by a blank line.

Usage: convert_cnec_stanford.py [corpus_in_cnec_xml]
"""

import io
import re
import sys

# Maps the value of an <ne> element's ``type`` attribute to the output label.
# Types that are not listed here are labelled "OTHER".
ENT = {"P":"PER","pp":"PER","p_":"PER","pf":"PER","ps":"PER","pb":"PER","pm":"PER","pc":"PER","pd":"PER",
       "ia":"ORG","if":"ORG","io":"ORG","ic":"ORG","i_":"ORG",
       "G":"LOC","gc":"LOC","gh":"LOC","gr":"LOC","gq":"LOC","gl":"LOC","gu":"LOC","gt":"LOC","gs":"LOC","g_":"LOC",
       "A":"LOC","a_":"LOC","ah":"LOC","az":"LOC", "lower": "O", "cap": "O"}

ne_type_re = re.compile(r'<ne type="([^"]*)">', re.U)
# The whitespace alternative must be ``\s+`` rather than ``\s*``: since
# Python 3.7 ``re.split`` also splits on empty matches, so a pattern that can
# match the empty string would cut every word into single characters.
markup_split_re = re.compile(r'(<[^>]*>)|(\s+)', re.U)


def convert_line(line):
    """Return the ``token<TAB>label`` rows for one line of CNEC XML.

    Args:
        line: one line of the corpus; surrounding whitespace is irrelevant
            because whitespace only ever acts as a token separator.

    Returns:
        A string with one ``token\\tlabel\\n`` row per token, or an empty
        string when the line contains no tokens.
    """
    rows = []
    depth = 0      # number of currently open <ne> elements
    label = "O"    # label of the outermost open <ne>, "O" outside any entity
    for token in markup_split_re.split(line):
        # Unmatched groups yield None, and separators yield whitespace/empty.
        if not token or not token.strip():
            continue
        opening = ne_type_re.search(token)
        if opening:
            depth += 1
            if depth == 1:
                # Only the outermost entity decides the label; nested
                # entities inherit it.
                label = ENT.get(opening.group(1), "OTHER")
        elif token == "</ne>":
            # Ignore a stray closing tag instead of letting the depth go
            # negative, which would mislabel the next entity on the line.
            if depth > 0:
                depth -= 1
            if depth == 0:
                label = "O"
        elif not token.startswith("<") or not token.endswith(">"):
            # Anything that is not a complete <...> tag is a real token.
            rows.append(u"{}\t{}\n".format(token, label))
    return u"".join(rows)


def main(argv):
    """Convert the corpus named by ``argv[1]`` and print it to stdout.

    Args:
        argv: command-line arguments including the program name.

    Returns:
        The process exit status: 0 on success, 1 when the corpus path is
        missing.
    """
    if len(argv) < 2:
        sys.stderr.write("Usage convert_cnec_stanford.py [corpus_in_cnec_xml]\n")
        return 1
    # Decode explicitly so the result does not depend on the locale.
    # NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
    with io.open(argv[1], encoding="utf-8") as corpus:
        for line in corpus:
            # The newline added by print() separates consecutive input lines
            # with a blank line in the output.
            print(convert_line(line))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))