#!/usr/bin/env python3
#!/nlp/projekty/metatrans2/public_html/python3/bin/python3
from contextlib import contextmanager
# import doctest
import os
import sys

from edge import Edge
from log import log, set_log_level, DEBUG
from symbols import Attrs, EdgeSymbol, RuleSymbol, Symbol, Token
from rule import Rule
from ijacek import TopDownChartParser


def assert_equal(computed, expected):
    if computed is not expected:
        if not (isinstance(computed, str) or isinstance(expected, str)):
            log.warning('%s is not identical to %s', computed, expected)
        if computed != expected:
            raise AssertionError('{} != {}'.format(computed, expected))
    # log.info('OK %s == %s', computed, expected)


@contextmanager
def assert_raises(error):
    try:
        yield
    except error as e:
        log.debug('correctly raised %s: %s', error.__name__, e)
    else:
        raise AssertionError('{} was not triggered'.format(error.__name__))


def test_attrs():
    attrs = Attrs(k='1')
    assert_equal(attrs['k'], '1')
    assert_equal(attrs.shared_attrs, False)
    assert_equal(str(attrs), 'k=1')
    attrs.parse_tag('c2')
    assert_equal(attrs.shared_attrs, False)
    assert_equal(attrs['c'], '2')
    assert_equal(str(attrs), 'k=1 c=2')

    copied = Attrs(attrs)
    assert_equal(copied.shared_attrs, True)
    assert_equal(copied['k'], '1')
    copied['k'] = '2'
    assert_equal(copied.shared_attrs, False)

    other = Attrs(k='1')
    # assert_equal(other <= attrs, True)
    assert_equal(other['k'], '1')
    assert_equal(len(other.attrs_nonempty), 1)
    assert_equal(other.pop('k'), '1')
    with assert_raises(KeyError):
        assert_equal(other['k'], None)
    assert_equal(len(other.attrs_nonempty), 0)


def test_token(TokenClass):
    # TODO: Token should set its length itself
    TOKEN = '(word=krásného lemma=krásný k=2 g=I n=S c=2 1–2)'
    token = TokenClass(word='krásného', lemma='krásný', tag='k2gInSc2',
                       begin=1, mwe=False, trailing_whitespace=True, length=1)
    assert_equal(str(token), TOKEN)

    token = TokenClass(k=1, word='strom', begin=2, length=1)
    assert_equal(str(token), '(word=strom k=1 2–3)')

    unambiguous = TokenClass(g='I', lemma='košatý', tag='k2c1', begin=1, length=1)
    assert_equal(str(unambiguous), '(lemma=košatý k=2 g=I c=1 1–2)')

    ambiguous = TokenClass(word='strom', lemma='strom', tag='k1gInSc1,k1gInSc4',
                           begin=1, length=1)
    assert_equal(str(ambiguous),
                 '(word=strom lemma=strom k=1 g=I n=S / c=1 / c=4 1–2)')


def test_rulesymbol():
    # old, probably duplicated
    assert_equal(str(Symbol('()')), '()')
    assert_equal(str(Symbol('(X)')), '(X)')
    assert_equal(str(Symbol('(c=1 k=1)')), '(k=1 c=1)')

    symbol = RuleSymbol(phrase='NP', k=True)
    assert_equal(symbol.phrase, 'NP')
    assert_equal(symbol['k'], True)
    assert_equal(len(symbol.attrs_nonempty), 1)
    assert_equal(symbol.attrs_nonempty[0].value, True)
    assert_equal(symbol.attrs_nonempty[0].attr_id, 2)  # as long as this does not change
    assert_equal(symbol.attrs_nonempty[0], symbol.attrs_fixed[2])

    other = RuleSymbol(phrase='NP')
    assert_equal(other <= symbol, True)


def test_edgesymbol():
    rule_symbol = RuleSymbol(phrase='NP', c=True)
    edge_symbol = EdgeSymbol(rule_symbol=rule_symbol)
    assert_equal(edge_symbol.rule_symbol.phrase, 'NP')
    assert_equal(edge_symbol['c'], True)
    assert_equal(len(edge_symbol.attrs_nonempty), 1)

    edge_symbol['c'] = '4'
    assert_equal(edge_symbol['c'], '4')
    assert_equal(len(edge_symbol.attrs_nonempty), 1)

    # TODO: also compare against the terminal (== or <=)
    terminal = Token(k=1, word='strom', c='4', begin=2)
    assert_equal(str(terminal), '(word=strom k=1 c=4 2–3)')

    extended_edge_symbol = EdgeSymbol(edge_symbol, edge=terminal)
    assert_equal(str(extended_edge_symbol), '(NP word=strom k=1 c=4 2–3)')


def test_rule():
    assert_equal(str(Rule('(X) → ε')), '(X) → ε')


def test_edge():
    rule = Rule('(X) → ε')
    epsilon_edge = Edge(predicted_by_rule=rule)
    assert_equal(epsilon_edge.closed, True)
    assert_equal(epsilon_edge.current_symbol, None)
    assert_equal(epsilon_edge.rule, rule)
    assert_equal(format(epsilon_edge, 's'), '(X 0–0) → ε •')


def test_match_terminal():
    terminal = Symbol('(word=test)')
    # In particular, this means that alongside a strict rule which requires
    # some attribute it is sometimes worth adding a permissive variant as
    # well, because the match against the strict rule will not go through.
    missing_attribute = Symbol(lemma='word=test is missing')
    assert_equal(terminal.match_terminal(missing_attribute), False)

    equal_terminal = Symbol(word='test')
    assert_equal(terminal.match_terminal(equal_terminal), True)

    broader_terminal = Symbol(word='test', lemma='extra')
    assert_equal(terminal.match_terminal(broader_terminal), True)

    facultative_attribute = Symbol('(g)')
    has_attr = Symbol(word='test', g='extra')
    no_attr = Symbol(word='test')
    assert_equal(facultative_attribute.match_terminal(has_attr), True)
    assert_equal(facultative_attribute.match_terminal(no_attr), True)

    nonterminal = Symbol('(NP c=1)')
    terminal = Symbol(c='1')
    assert_equal(nonterminal.match_terminal(terminal), False)


def test_extend_using():
    # '(INFINITIVE) → (k=5 m=F) (CLITICS) (CONSTITUENTS)'
    rule_open_edge = Rule('(DEP_CLAUSE type) → (SUBORDINATOR type) (k=5 head)')
    open_edge = Edge(predicted_by_rule=rule_open_edge)
    assert_equal(str(open_edge),
                 '(DEP_CLAUSE type 0–0) → • (SUBORDINATOR type) (head k=5)')

    closed_edge = Edge(predicted_by_rule=Rule(
        '(SUBORDINATOR type=place) → (lemma=odkud)'))
    closed_edge.length = 1
    assert_equal(closed_edge.left['type'], 'place')

    extended_edge = Edge(extended_from=open_edge, extended_using=closed_edge,
                         how_created='test!')
    assert_equal(
        str(extended_edge),
        '(DEP_CLAUSE type=place 0–1) → (SUBORDINATOR type=place 0–1) • (head k=5)')
    assert_equal(extended_edge.how_created, 'test!')
    assert_equal(extended_edge.rule, rule_open_edge)
    assert_equal(extended_edge.extended_from, open_edge)
    assert_equal(extended_edge[0].edge, closed_edge)

    wrong_phrase = Edge(predicted_by_rule=Rule('(NP) → (word=zlo)'))
    assert_equal(open_edge.extended_by(wrong_phrase), False)

    # identical unless modified
    assert_equal(open_edge[1], extended_edge[1])
    assert_equal(open_edge[0] == extended_edge[0], False)


def test_read_terminal():
    rule_open_edge = Rule('(INFINITIVE) → (k=5 m=F) (CLITICS) (CONSTITUENTS)')
    open_edge = Edge(predicted_by_rule=rule_open_edge)
    assert_equal(str(open_edge),
                 '(INFINITIVE 0–0) → • (k=5 m=F) (CLITICS) (CONSTITUENTS)')

    matching_tokens = [
        Symbol('(k=5 m=F word=povést) '),
    ]
    extended_edge = Edge(extended_from=open_edge, tokens=matching_tokens)
    assert_equal(
        str(extended_edge),
        '(INFINITIVE 0–1) → (word=povést k=5 m=F 0–1) • (CLITICS) (CONSTITUENTS)')
    assert_equal(open_edge[1], extended_edge[1])
    assert_equal(open_edge[0] == extended_edge[0], False)


if __name__ == '__main__':
    if '--debug' in sys.argv:
        set_log_level(DEBUG)

    test_attrs()
    test_token(Symbol)
    test_token(Token)
    test_rulesymbol()
    test_edgesymbol()
    test_rule()
    test_match_terminal()
    test_edge()
    test_extend_using()
    test_read_terminal()

    top_down = TopDownChartParser(argv=sys.argv)

    tokens = [
        Symbol(tag='k2gInSc1', word='krásný'),
        Symbol(tag='k2gInSc1', word='košatý'),
        Symbol(tag='k1', word='strom', g='I', n='S', c='1'),
        Symbol(word=','),
        Symbol(k='7', word='na'),
        Symbol(tag='k3', lemma='který', g='I', n='S'),
        Symbol(tag='k6', word='silně'),
        Symbol(tag='k5', word='foukalo', e='A', a='I', m='A', g='N', n='S'),
        Symbol(tag='k3', word='se'),
        Symbol(tag='k5eNaPmAgInS', word='nevyvrátil'),
    ]
    top_down.parse_and_evaluate(tokens)

    tokens = [
        # ?košatý krásný strom (the evaluative word should come before the specifying one)
        Symbol(tag='k2', word='krásný', g='I', n='S', c='4'),
        Symbol(tag='k2', word='košatý', g='I', n='S', c='4'),
        Symbol(tag='k1', word='strom', g='I', n='S', c='1'),
    ]
    top_down.parse_and_evaluate(tokens, 'NP')

    tokens = (
        'Líbí líbit k5eAaImIp3nS',
        'se sebe k3xPyFc4',
        'mi já k3xPp1nSc3',
        'to ten k3xDgNnSc1',
        ', , kIx,',
        'co co k3yQnSc4',  # cos?
        'jsi být k5eAaImIp2nS',
        'napsal napsat k5eAaPmAgMnS',
        # '. . kIx.',
    )
    top_down.parse_from_vertical(tokens)

    tokens = [
        'strom strom k1gInSc1,k1gInSc4',
        'života život k1gInSc2',
    ]
    top_down.parse_from_vertical(tokens, root='NP')
    # 0–2 (NP g=I n=S c=1) → (NP g=I n=S c=1) (NP g=I n=S c=2) •

    tokens = (
        Symbol('(word=Zkouším lemma=zkoušet k=5 e=A n=S p=1 a=I m=I)'),
        Symbol('(word=parser lemma=parser k=1 g=I n=S c=1)'),
        Symbol('(word=a lemma=a k=8 x=C)'),
        Symbol('(word=daří lemma=dařit k=5 e=A n=S p=3 a=I m=I)'),
        Symbol('(word=se lemma=sebe tag=k3xPyFc4)'),
        Symbol('(word=! lemma=! k=I x=.)'),
    )
    top_down.parse_and_evaluate(tokens)

    tokens = '''
Při při k7c6
střetech střet k1gInPc6
stoupenců stoupenec k1gMnPc2
Ruska Rusko k1gNnSc2
s s k7c7
ukrajinskými ukrajinský k2eAgInPc7d1
radikály radikál k1gInPc7
zemřeli zemřít k5eAaPmAgMnP
v v k7c6
Charkově Charkov k1gInSc6
v v k7c6
noci noc k1gFnSc6
na na k7c4
sobotu sobota k1gFnSc4
dva dva k4xCgMnPc1
lidé člověk k1gMnPc1
. . kIx.
'''.split('\n')
    top_down.parse_from_vertical(tokens)


# TODO: automatically detect that we are on a (color) console and enable the pager
def pipepager(text, cmd='less'):
    """Page through text by feeding it to another program."""
    pipe = os.popen(cmd, 'w')
    try:
        pipe.write(text)
        pipe.close()
    except IOError:
        pass  # Ignore broken pipes caused by quitting the pager program.
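

# A minimal sketch of the TODO above, assuming the parser output would first
# be collected as a string; the helper name `show` is hypothetical and not
# part of the parser's API. It pages the text through pipepager() only when
# stdout is an interactive terminal, otherwise it prints the text directly.
def show(text, pager_cmd=None):
    """Page `text` on an interactive console, print it otherwise (sketch)."""
    if sys.stdout.isatty():
        pipepager(text, cmd=pager_cmd or os.environ.get('PAGER', 'less'))
    else:
        print(text)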