from collections import defaultdict
from shlex import shlex

from attrs import Attrs
from log import log


class Rule:
    """
    Should be declared and kept immutable so that it can be hashed
    (this applies to its subclass, Edge, as well).
    """
    def __init__(self, left, right):
        self.left = left
        self.right = right
        self.epsilon_rules = list(self._process_optional_tokens())
        self.attr_matches = self._process_matches()

    def __str__(self):
        right = [str(Attrs(attrs)) for attrs in self.right]
        if 'point' in self.__dict__:
            right.insert(self.point, '•')
            if len(right) == 1:  # ε-rule with • now inserted
                right.insert(0, 'ε')  # or just right = list('ε•')
        elif not right:
            right = ['ε']
        span = ('{}–{} '.format(self.begin, self.end)
                if 'begin' in self.__dict__ else '')
        return span + str(Attrs(self.left)) + ' → ' + ' '.join(right)

    def _process_matches(self):
        """
        Collect, for every attribute given without a value (None), the
        indexes of the rule tokens it has to match across; the left side
        gets index -1, the right-side tokens are numbered from 0.
        """
        attr_matches = defaultdict(list)  # → OrderedDict, so that I don't have to sort
        for index, token in enumerate([self.left] + list(self.right), -1):
            for attr, value in token.items():
                if value is None:
                    attr_matches[attr].append(index)
        for attr, match_on_indexes in attr_matches.items():
            if len(match_on_indexes) == 1:
                raise ValueError('No other rule token to match "%s" on' % attr)
        return attr_matches

    @classmethod
    def from_string(cls, rule):
        left, right = rule.split(' → ')
        left = next(cls.read_rule(left))
        right = list(cls.read_rule(right))
        return cls(left, right)

    @staticmethod
    def tokenize_rule(rule):
        parser = shlex(rule)  # can’t handle Czech without parentheses
        while True:
            token = parser.get_token()
            if not token:
                yield None  # None as another signal of termination
                break
            yield token

    @staticmethod
    def read_rule(rule):
        """
        Yield symbols, i.e. (non-)terminals as attribute dicts, from a
        string which has already been split into its left/right side.
        """
        attrs = None  # not None → inside a token (= the whole "attribute",
                      # i.e. the group inside parentheses)
        attr = ''
        equal_sign = False
        for token in Rule.tokenize_rule(rule):
            if token == '(':
                if attrs is None:
                    attrs = {}
                    continue
                elif 'phrase' not in attrs or attrs['phrase']:
                    # there was no „?“; even if there was, keep the code-flow shared
                    yield attrs
                    attrs = {}
                    continue
                else:  # under exceptional circumstances, e.g. (a=( b=1)?
                    raise ValueError('Unexpected "("')
            elif token is None:
                if attrs is not None:
                    yield attrs
                    attrs = None
                break
            elif attrs is None:
                raise ValueError('Outside token: ' + token)
            elif token == ')':
                if equal_sign:
                    raise ValueError('Unterminated attribute ' + attr)
                elif attr:
                    attrs[attr] = None
                if 'phrase' in attrs and attrs['phrase'] is None:
                    del attrs['phrase']  # could be a „?“
                attr = ''
                equal_sign = False
                continue
            elif token == '?':
                attrs['optional'] = True
                continue
                # yield attrs
                # attrs = None
            if not attr and token == '=':
                raise ValueError('Unexpected "="')
            if not attr and 'phrase' not in attrs:
                # the first word of a token: an upper-case phrase name,
                # or a lower-case attribute of a terminal
                if token.islower():  # c, tag, word…
                    attrs['phrase'] = None  # not the phrase head name
                    attr = token
                else:
                    attrs['phrase'] = token
                continue
            if not attr:
                attr = token
            elif token == '=':
                equal_sign = True
            elif equal_sign:
                quote = token[0] if token[0] in '"\'' else None
                if quote and token[-1] != quote:
                    raise ValueError('Not properly quoted: ' + attr)
                if quote:
                    token = token[1:-1]
                attrs[attr] = token
                attr = ''
                equal_sign = False
            else:
                # a bare attribute (no "=value") marks a match slot
                attrs[attr] = None
                attr = token
        if attrs is not None:
            raise ValueError('Rule not terminated by ")"')

    def _process_optional_tokens(self):
        """
        For every right-side token marked „?“, yield the helper rules that
        make it optional: an optional terminal is first replaced in place
        by a fresh non-terminal (left phrase name + position), together
        with a unit rule expanding it back to the terminal; in both cases
        an ε-rule for the token’s phrase is yielded.
        """
        for index, token in enumerate(self.right, 1):
            if token.get('optional', False):
                if 'phrase' not in token:
                    new_token = dict(token)
                    del new_token['optional']
                    # token['word'] = None
                    token.pop('word', None)
                    token.pop('lemma', None)
                    token.pop('tag', None)
                    token['phrase'] = self.left['phrase'] + str(index)
                    new_head = dict(token)
                    del new_head['optional']
                    yield Rule(new_head, [new_token])
                yield Rule(dict(phrase=token['phrase']), [])
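

# A minimal usage sketch (an addition, not part of the original module). It
# illustrates the rule notation that read_rule() above appears to accept: an
# upper-case name opens a phrase, lower-case names are attributes, a bare
# attribute marks a slot that must match across tokens, and a trailing „?“
# makes a token optional. The concrete rule below is invented for
# illustration, and running it assumes the sibling attrs module (Attrs) and
# log module are importable.
if __name__ == '__main__':
    rule = Rule.from_string('(NP c) → (N c) (A c="1")?')
    print(rule)  # exact rendering depends on the local Attrs helper
    for epsilon_rule in rule.epsilon_rules:
        print(epsilon_rule)  # the ε-rule generated for the optional (A …)? token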