#!/nlp/projekty/metatrans2/public_html/python3/bin/python3
"""Parse text from stdin with the ijacek chart parser and report output files.

The script reads its input from standard input, writes all intermediate and
final artefacts into a time-stamped location under ``cache/`` and prints a
JSON object mapping artefact kinds to their paths on standard output.
"""

from collections import defaultdict
from datetime import datetime
from itertools import chain
import json
import logging
import os
from subprocess import Popen, PIPE, DEVNULL
import sys

from ijacek import TopDownChartParser

# Accepted values of the first command-line argument.
INPUT_TYPES = ('plaintext', 'vertical')


def web():
    """Run the whole pipeline for one request and print the result paths.

    The input type is taken from ``sys.argv[1]`` when it is one of
    ``INPUT_TYPES``; otherwise ``'plaintext'`` is assumed.  Plain text is
    first passed through ``tokenize_tag_desambiguate``; the resulting
    vertical text is parsed, one ``.dot`` file is written per successful
    analysis and Graphviz ``dot`` is started in the background to render
    them to SVG.  Finally a JSON object with the produced paths is written
    to the real standard output.
    """
    input_type = (sys.argv[1]
                  if len(sys.argv) >= 2 and sys.argv[1] in INPUT_TYPES
                  else 'plaintext')
    text = sys.stdin.read()
    # Remember the real streams; they are temporarily redirected below.
    pipes = (sys.stdout, sys.stderr)

    file_name = '{:%m-%d-%H-%M-%S}'.format(datetime.now())
    path = os.path.join('cache', file_name)
    output_paths = {}
    try:
        os.mkdir(path)
    except OSError:  # PermissionError, FileNotFoundError, FileExistsError
        # TODO: logging.error and a path_log
        # TODO: and anyway, this is a critical error; it should be reported
        #       loudly and nothing more should be done!
        # Best effort: keep going with files placed directly under cache/.
        print('mkdir({}) failed'.format(path), file=sys.stderr)
    else:
        path = os.path.join(path, file_name)

    path_desamb = path + '.vert'
    path_vertical_phrases = path + '.xml'
    path_ijacek_log = path + '.xml.log'
    path_dot = path + '-{}-{}.dot'
    # path_graphviz_log = path + '.dot.log'

    if input_type == 'plaintext':
        text = tokenize_tag_desambiguate(text)
        with open(path_desamb, 'w') as desamb_file:
            print(text, file=desamb_file)
        output_paths.update(desamb=path_desamb)

    dots = defaultdict(list)
    try:
        with open(path_vertical_phrases, 'w') as phrases_file, \
                open(path_ijacek_log, 'w') as log_file:
            # The parser reports through sys.stdout / sys.stderr, so point
            # them at the output and log files for the duration of the run.
            sys.stdout, sys.stderr = phrases_file, log_file
            output_paths.update(vertical_phrases=path_vertical_phrases,
                                ijacek_log=path_ijacek_log)

            ijacek = TopDownChartParser(argv=[None, '--vertical-phrases'])
            parse(ijacek, text)

            for sentence, edges in ijacek.successful_edges.items():
                for index, analysis in enumerate(edges):
                    dot = path_dot.format(sentence, index)
                    # Close every .dot file before Graphviz is started so
                    # that it never reads a partially flushed file.
                    with open(dot, 'w') as dot_file:
                        sys.stdout = dot_file
                        try:
                            ijacek.print_dot(analysis)
                        finally:
                            sys.stdout = phrases_file
                    dots[sentence].append(dot)
    finally:
        # Always give the real streams back, even when parsing fails.
        sys.stdout, sys.stderr = pipes
    output_paths.update(phrasal_trees=dots)

    dot_files = list(chain.from_iterable(dots.values()))
    if dot_files:
        try:
            # Deliberately not waited for: rendering continues in the
            # background while the paths are reported to the caller.
            Popen(['dot', '-Tsvg', '-O'] + dot_files,
                  stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL)
        except OSError as error:
            print('dot could not be started: {}'.format(error),
                  file=sys.stderr)

    sys.stdout.write(json.dumps(output_paths))
    sys.stdout.flush()


def tokenize_tag_desambiguate(plaintext):
    """Pipe *plaintext* through ``unitok.py`` and ``desamb.sh``.

    The text is sent to the shell pipeline encoded as ISO-8859-2 and the
    pipeline's standard output is decoded with the same encoding and
    returned as a string.  The pipeline's standard error is discarded.

    Raises:
        UnicodeEncodeError: if *plaintext* contains characters that cannot
            be represented in ISO-8859-2.
    """
    # shell=True is required for the pipe; the command is a constant.
    proc = Popen('/corpora/programy/unitok.py --language=czech '
                 '--encoding=ISO-8859-2 | /corpora/programy/desamb.sh',
                 stdin=PIPE, stdout=PIPE, stderr=DEVNULL, shell=True)
    tagged, _ = proc.communicate(plaintext.encode('ISO-8859-2'))
    return tagged.decode('ISO-8859-2')


def parse(ijacek, vertical):
    """Feed the lines of *vertical* to ``ijacek.parse_from_vertical``.

    A leading and a trailing string are printed to the current
    ``sys.stdout`` around the parser's own output.
    """
    # NOTE(review): these literals look as if they once held markup
    # (e.g. a document header and footer) that has been lost -- confirm
    # against the original file before relying on the exact output.
    print('\n'
          # '\n'
          '')
    ijacek.parse_from_vertical(lines=vertical.split('\n'))
    print('')


if __name__ == '__main__':
    web()