#!/usr/bin/python3
'''CLI and web interface to a “chunker” and Functional Sentence Perspective
“tagger” for Czech

As input, the following is accepted:
– plain text (with no boilerplate)
– vertical text with the columns word, lemma and tag as given by desamb.sh
  (and XML-like sentence-delimiting tags on separate lines, marking
  sentences/utterances)
– vertical text with two additional columns, node_id and parent_id,dep_label
  (the label is optional, the comma is not), as given by set.py --sconll
– vertical text with additional XML-like tags as given by the chunker
  – e.g. enclitic, conjunction, … – wrapping sentence constituents of
  top-level clauses in a complex sentence; they may also have attributes
  such as 'type', 'PNE', …

The format of the input shall be given as the only parameter to the program,
e.g. fsp.py plaintext (others: vertical, sconll, xml)

The output is always given in a format consisting of vertical text including
all five columns and, on separate lines, the XML-like tags described above.

Input in one of the first three formats is passed to the respective tools to
complete the information needed for further processing. These tools
currently include:
– unitok.py (tokenizing free text to tokens on separate lines)
– desamb.sh (splitting sentences on ending punctuation; morphological
  tagging)
– set.py (full syntactic analysis using our modified grammar)
– [chunker.py] (extracting constituents from top-level clauses and
  determining their type and various properties important for word
  order-based FSP tagging)

Once the clause constituents have been found, the word-order positions they
belong to are heuristically labelled (they appear as the attribute
'position' in the XML tags). Basic FSP tagging is then performed, taking
constituent type and position into account. The result is, again, stored in
XML attributes, e.g. 'rheme-proper'.

For debugging, logging output from the tools is saved to files in a
subdirectory; this includes a visualization of dependency trees if SET is
run.

TODO: without the parameter, act as a simple filter (plain FSP tagger)
'''

from datetime import datetime
import json
import os
from subprocess import Popen, PIPE  # , DEVNULL
import sys

from sentence import sentences_from_sconll, sentences_from_chunks


def split_sentences(lines):
    '''Delimit sentences using full stops – too bad for Mr. Etc.'''
    for n, line in enumerate(lines):
        # Append an extra blank line after a sentence-final full stop;
        # guard against indexing past the end of the list.
        if (line == '.\t.\tkIx.'
                and n + 1 < len(lines) and lines[n + 1] != ''):
            lines[n] += '\n\n'
    return '\n'.join(lines)
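# A minimal sketch of what split_sentences does, on hypothetical
# desamb-style vertical lines (the words and tags below are made up;
# only '.\t.\tkIx.' is the literal pattern the function matches):
#
#     >>> split_sentences(['Prší\tpršet\tk5', '.\t.\tkIx.', 'Ano\tano\tk9'])
#     'Prší\tpršet\tk5\n.\t.\tkIx.\n\n\nAno\tano\tk9'
#
# i.e. a blank-line sentence boundary is inserted after the full stop.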
def export_trees(lines, path_dep):
    '''Convert vertical lines to a dependency format and render SVG trees
    (TreeExporter writes to path_svg, a global set in the main block).'''
    import io
    from vert2dep import vert2dep
    global output_all
    dep = io.StringIO()
    vert2dep(lines, dep)
    if output_all:
        print(dep.getvalue(), file=open(path_dep, 'w'))
    proc = Popen(('/usr/bin/python2',
                  '/home/xsvobo15/set/TreeViewer/TreeExporter.py', path_svg),
                 stdin=PIPE, stdout=open('/dev/null', 'w'), stderr=PIPE,
                 universal_newlines=True)
    svg, svg_err = proc.communicate(dep.getvalue())
    # print(svg_err, file=sys.stderr)


PATH_SET = 'set/set.py'  # '/home/xsvobo15/set/set.py'
INPUT_TYPES = ('plaintext', 'vertical', 'sconll', 'xml')

if __name__ == '__main__':
    input_type = 'plaintext'
    output_all = True
    if len(sys.argv) >= 2 and sys.argv[1] in INPUT_TYPES:
        input_type = sys.argv[1]
        output_all = len(sys.argv) < 3 or sys.argv[2] != '-'
    data = sys.stdin.read()

    # TODO: rename
    paths = {}
    pipes = (sys.stdout, sys.stderr)
    fileName = '{:%m-%d-%H-%M-%S}'.format(datetime.now())
    path = os.path.join('cache', fileName)
    if output_all:
        try:
            os.mkdir(path)
            path = os.path.join(path, fileName)
        except OSError:
            print('mkdir({}) failed'.format(path), file=sys.stderr)
    path_desamb = path + '.vert'          # desamb
    path_desamb_log = path + '.vert.log'  # desamb
    path_set = path + '.set'              # SET
    path_set_log = path + '.set.log'      # SET
    path_xml = path + '.xml'              # chunker
    path_xml_log = path + '.xml.log'
    path_fsp = path + '.fsp'              # tagger
    path_fsp_log = path + '.fsp.log'
    path_dep = path + '.dep'
    path_svg = path + '-.svg'

    if input_type == 'plaintext':
        # Tokenize the raw text and tag it morphologically.
        proc = Popen('/corpora/programy/unitok.py --language=czech '
                     '--encoding=ISO-8859-2 | /corpora/programy/desamb.sh',
                     stdin=PIPE, stdout=PIPE,
                     stderr=open('/dev/null', 'w'), shell=True)
        data, err = proc.communicate(data.encode('ISO-8859-2'))
        data = data.decode('ISO-8859-2')
        data = split_sentences(data.split('\n'))
        if output_all:
            print(data, file=open(path_desamb, 'w'))
            paths.update(desamb=path_desamb, desamb_log=path_desamb_log)

    if input_type in ('plaintext', 'vertical'):
        # Full syntactic analysis with SET.
        proc = Popen(('/usr/bin/python2', PATH_SET, '--sconll',
                      '--preserve-xml-tags', '-v', '--grammar=fsp.set'),
                     stdin=PIPE, stdout=PIPE,
                     stderr=open(path_set_log, 'w'),
                     universal_newlines=True)
        data, err = proc.communicate(data)
        if output_all:
            print(data, file=open(path_set, 'w'))
            paths.update(set=path_set, set_log=path_set_log)

    lines = data.split('\n')

    if input_type in ('plaintext', 'vertical', 'sconll'):
        if output_all:
            export_trees(lines, path_dep)
            paths.update(dep=path_dep)
            sys.stdout = open(path_xml, 'w')
            sys.stderr = open(path_xml_log, 'w')
            paths.update(xml=path_xml, xml_log=path_xml_log)
        sentences = list(sentences_from_sconll(lines))
        # we assume the file is in a subdirectory (“cache/[datetime]/”)
        if output_all:
            print('\n'
                  '\n'
                  '')
        for s in sentences:
            s.find_chunks()
            if output_all:
                s.print_vertical(sys.stdout)
        if output_all:
            print('')

    if output_all:
        sys.stdout = open(path_fsp, 'w')
        sys.stderr = open(path_fsp_log, 'w')
        paths.update(fsp=path_fsp, fsp_log=path_fsp_log)

    if input_type == 'xml':
        sentences = list(sentences_from_chunks(lines))

    print('\n'
          '\n'
          '')
    for s in sentences:
        if not s.error:
            s.find_clauses()
            s.label_positions()
        s.print_vertical(sys.stdout)
    print('')

    if output_all:
        sys.stdout, sys.stderr = pipes
        sys.stdout.write(json.dumps(paths))
        sys.stdout.flush()
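# Example invocation (a sketch; the tool locations above are site-specific,
# and the script expects to be run from a directory containing 'cache/'):
#
#     $ echo 'Příklad věty.' | ./fsp.py plaintext
#
# Unless '-' is passed as a second argument, intermediate results are cached
# under cache/<timestamp>/ and a JSON object mapping stage names ('desamb',
# 'set', 'dep', 'xml', 'fsp' and their *_log variants) to the cached files
# is printed at the end.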