#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Convert a CNEC XML corpus into Stanford-NER style training data.

Each input line is split into markup tags and whitespace-separated tokens.
Every token is written as ``token<TAB>label`` on its own line, and a blank
line is printed after each input line.

Usage: convert_cnec_stanford.py [corpus_in_cnec_xml]
"""

import io
import re
import sys

# Entity-type codes (the ``type`` attribute of an ``<ne>`` tag) mapped to the
# output label. Codes that are not listed here are labelled "OTHER".
ENT = {"P":"PER","pp":"PER","p_":"PER","pf":"PER","ps":"PER","pb":"PER","pm":"PER","pc":"PER","pd":"PER",
       "ia":"ORG","if":"ORG","io":"ORG","ic":"ORG","i_":"ORG",
       "G":"LOC","gc":"LOC","gh":"LOC","gr":"LOC","gq":"LOC","gl":"LOC","gu":"LOC","gt":"LOC","gs":"LOC","g_":"LOC",
       "A":"LOC","a_":"LOC","ah":"LOC","az":"LOC",
       "lower": "O", "cap": "O"}

# Opening entity tag; group 1 captures the type code.
# NOTE(review): the pattern in the file was empty (it matched every token, so
# nothing was ever emitted). It is restored here to the CNEC XML form
# <ne type="..."> -- confirm against the corpus release in use.
ne_type_re = re.compile(r'<ne type="([^"]+)">', re.U)

# Splits a line into tags and whitespace runs. The whitespace alternative
# must be ``\s+`` (not ``\s*``): since Python 3.7 ``re.split`` also splits on
# empty matches, which would cut every word into single characters.
markup_split_re = re.compile(r'(<[^>]*>)|(\s+)', re.U)

# Closing entity tag (counterpart of ne_type_re).
NE_CLOSE = "</ne>"


def convert_line(line):
    """Return the ``token<TAB>label`` rows for one line of CNEC XML.

    Tokens inside an ``<ne>`` element receive the label of the *outermost*
    enclosing entity (looked up in ``ENT``, "OTHER" if the type is unknown);
    tokens outside any entity receive "O". Tags other than ``<ne>``/``</ne>``
    are dropped. Every row ends with a newline; an empty or tag-only line
    yields an empty string.
    """
    rows = []
    depth = 0      # current <ne> nesting depth
    label = "O"    # label applied to plain tokens at this point
    for token in markup_split_re.split(line.strip()):
        # split() yields None for the capture group that did not take part
        # in a match, plus empty / whitespace-only separator pieces.
        if not token or not token.strip():
            continue
        opening = ne_type_re.search(token)
        if opening:
            depth += 1
            if depth == 1:
                # Only the outermost entity decides the label.
                label = ENT.get(opening.group(1), "OTHER")
        elif token == NE_CLOSE:
            # Clamp at zero so a stray closing tag cannot push the depth
            # negative and make the next entity be labelled "O".
            if depth > 0:
                depth -= 1
            if depth == 0:
                label = "O"
        elif not token.startswith("<") or not token.endswith(">"):
            rows.append(u"{}\t{}\n".format(token, label))
    return u"".join(rows)


def main(argv=None):
    """Convert the corpus named in ``argv[1]`` and print it to stdout.

    ``argv`` defaults to ``sys.argv``. Without a file argument the usage
    message is printed and the process exits with status 0 (unchanged from
    the previous behaviour of this script).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Usage convert_cnec_stanford.py [corpus_in_cnec_xml]")
        sys.exit(0)
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    # NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
    with io.open(argv[1], encoding="utf-8") as f:
        for line in f:
            # print() adds one more newline, leaving a blank separator line
            # after the rows of each input line.
            print(convert_line(line))


if __name__ == "__main__":
    main()