#!/nlp/projekty/metatrans2/public_html/python3/bin/python3
from collections import defaultdict
from datetime import datetime
from itertools import chain
import json
import logging
import os
from subprocess import Popen, PIPE, DEVNULL
import sys
from ijacek import TopDownChartParser
# Input formats that may be selected with the first command-line argument;
# web() falls back to 'plaintext' when the argument is missing or not listed.
INPUT_TYPES = ('plaintext', 'vertical')
def web():
    """Process one request: parse stdin and report the generated files.

    The input text is read from standard input.  The first command-line
    argument selects the input type when it is one of ``INPUT_TYPES``;
    otherwise ``'plaintext'`` is assumed, in which case the text is first
    run through ``tokenize_tag_desambiguate`` and the result is saved to
    a ``.vert`` file.

    While the parser runs, ``sys.stdout`` is redirected to an ``.xml`` file
    and ``sys.stderr`` to an ``.xml.log`` file.  Every analysis found in
    ``ijacek.successful_edges`` is then printed into its own ``.dot`` file
    and ``dot -Tsvg -O`` is started in the background on all of them (the
    function does not wait for it to finish).

    Files are named after the current timestamp and are placed in
    ``cache/<timestamp>/`` (or directly next to it in ``cache/`` when the
    directory cannot be created).  Finally a JSON object mapping artefact
    names to paths is written to the original standard output.

    The original streams are restored even when parsing fails, so that the
    resulting traceback reaches the real standard error.
    """
    input_type = 'plaintext'
    if len(sys.argv) >= 2 and sys.argv[1] in INPUT_TYPES:
        input_type = sys.argv[1]
    text = sys.stdin.read()
    pipes = (sys.stdout, sys.stderr)

    file_name = '{:%m-%d-%H-%M-%S}'.format(datetime.now())
    path = os.path.join('cache', file_name)
    output_paths = {}
    try:
        os.mkdir(path)
    except OSError:  # e.g. PermissionError, FileNotFoundError
        # TODO: logging.error and path_log
        # TODO: this is really a critical error; it should be reported
        #       loudly and nothing more should be done!
        print('mkdir({}) failed'.format(path), file=sys.stderr)
    else:
        path = os.path.join(path, file_name)

    path_desamb = path + '.vert'
    path_vertical_phrases = path + '.xml'
    path_ijacek_log = path + '.xml.log'
    path_dot = path + '-{}-{}.dot'
    # path_graphviz_log = path + '.dot.log'

    if input_type == 'plaintext':
        text = tokenize_tag_desambiguate(text)
        with open(path_desamb, 'w') as desamb_file:
            print(text, file=desamb_file)
        output_paths.update(desamb=path_desamb)

    output_paths.update(vertical_phrases=path_vertical_phrases,
                        ijacek_log=path_ijacek_log)
    dots = defaultdict(list)
    with open(path_vertical_phrases, 'w') as phrases_file, \
            open(path_ijacek_log, 'w') as log_file:
        sys.stdout, sys.stderr = phrases_file, log_file
        try:
            ijacek = TopDownChartParser(argv=[None, '--vertical-phrases'])
            parse(ijacek, text)
            for sentence, edges in ijacek.successful_edges.items():
                for index, analysis in enumerate(edges):
                    dot = path_dot.format(sentence, index)
                    # Close every graph file before Graphviz is started so
                    # that it never reads a partially flushed file.
                    with open(dot, 'w') as dot_file:
                        sys.stdout = dot_file
                        try:
                            ijacek.print_dot(analysis)
                        finally:
                            # Never leave sys.stdout bound to a closed file.
                            sys.stdout = phrases_file
                    dots[sentence].append(dot)
        finally:
            sys.stdout, sys.stderr = pipes
    output_paths.update(phrasal_trees=dots)

    dot_paths = list(chain.from_iterable(dots.values()))
    if dot_paths:
        try:
            # Argument list instead of a shell string: no quoting issues.
            # Fire and forget -- the SVG files appear asynchronously.
            Popen(['dot', '-Tsvg', '-O'] + dot_paths,
                  stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL)
        except OSError:
            # Rendering stays best effort, as with the former shell call.
            print('dot could not be started', file=sys.stderr)

    sys.stdout.write(json.dumps(output_paths))
    sys.stdout.flush()
def tokenize_tag_desambiguate(plaintext):
    """Pipe *plaintext* through the external ``unitok.py | desamb.sh`` pipeline.

    The text is sent to the pipeline's standard input encoded as ISO-8859-2
    and whatever the pipeline writes to its standard output is decoded from
    ISO-8859-2 and returned as a string.  The pipeline's standard error is
    discarded.

    Raises ``UnicodeEncodeError`` when *plaintext* contains characters that
    cannot be represented in ISO-8859-2.
    """
    # DEVNULL instead of open('/dev/null', 'w'): no file object is leaked.
    proc = Popen('/corpora/programy/unitok.py --language=czech '
                 '--encoding=ISO-8859-2 | /corpora/programy/desamb.sh',
                 stdin=PIPE, stdout=PIPE, stderr=DEVNULL,
                 shell=True)
    tagged, _ = proc.communicate(plaintext.encode('ISO-8859-2'))
    return tagged.decode('ISO-8859-2')
def parse(ijacek, vertical):
    """Feed the vertical text to *ijacek*, framed by header and footer output.

    A header is printed to the current standard output, then *vertical* is
    split on newlines and handed to ``ijacek.parse_from_vertical`` as
    ``lines``, and finally a footer is printed.
    """
    # A second header line is currently disabled in the original source.
    header = '\n' ''
    footer = ''

    print(header)
    ijacek.parse_from_vertical(lines=vertical.split('\n'))
    print(footer)
# Run the request handler only when executed as a script, not on import.
if __name__ == '__main__':
    web()