"""Helpers for XML-like structure tags and compact attribute tags.

Contains :class:`XmlTag` (a small tag model with a tolerant parser), helpers
that split compact two-character attribute tags into ``(attr, value)`` pairs,
and a shell-like ``escape`` function used for readable output.
"""
from collections import defaultdict
from itertools import zip_longest
import re
from shlex import shlex


# Attribute names: the two full-word ones followed by single-letter ones.
ATTRS = ['word', 'lemma'] + list('kegncpamdxytzw~')
# Attribute name -> its position in ATTRS.
ATTR_MAP = {attr: attr_id for (attr_id, attr) in enumerate(ATTRS)}
ATTR_IDS = list(range(len(ATTRS)))


class XmlTag:
    """A single XML-like tag with an ordered list of attributes.

    Attributes are kept as a list of ``(name, value)`` pairs, so duplicates
    and the original order are preserved.  A value of ``None`` means the
    attribute was given without a value.
    """

    def __init__(self, lines=None, name=None, closing=None, empty=None,
                 attributes=None, indentation_level=0):
        self.lines = lines
        self.name = name
        # True for </name>, False for <name>, None when not yet known.
        self.closing = closing
        # Truthy for self-closing tags such as <name/>.
        self.empty = empty
        self.attributes = attributes or []
        self.indentation_level = indentation_level

    @property
    def opening(self):
        """True unless the tag is a closing one."""
        return not self.closing

    def append(self, attr, value):
        """Add a new ``(attr, value)`` pair at the end."""
        self.attributes.append((attr, value))

    def __contains__(self, key):
        """Return True if any attribute is named ``key``."""
        return any(key == attr for attr, _ in self.attributes)

    def clear(self):
        """Remove all attributes."""
        self.attributes.clear()

    def __getitem__(self, key):
        """Get the value of the last attribute named ``key``.

        Returns None when there is no such attribute.
        """
        for attr, value in reversed(self.attributes):
            if key == attr:
                return value
        return None

    def __setitem__(self, attr, value):
        """Set the value of the last occurrence of attr, or append a new one."""
        for index in range(len(self.attributes) - 1, -1, -1):
            if self.attributes[index][0] == attr:
                self.attributes[index] = (attr, value)
                return
        self.attributes.append((attr, value))

    def __iter__(self):
        """Iterate over the ``(attr, value)`` pairs."""
        return iter(self.attributes)

    def __str__(self):
        """Render the tag as text, indented by ``indentation_level`` spaces.

        Attributes whose value is None are rendered bare (``<a b>``) so that
        a tag produced by :meth:`parse` renders back without a bogus
        ``b="None"`` value.
        """
        opening = '' if self.opening else '/'
        empty = '/' if self.empty else ''
        attrs = ''.join(
            ' {}'.format(attr) if val is None else ' {}="{}"'.format(attr, val)
            for attr, val in self.attributes)
        return '{indent}<{opening}{name}{attrs}{empty}>'.format(
            opening=opening, name=self.name, attrs=attrs, empty=empty,
            indent=' '*self.indentation_level)

    def __repr__(self):
        """Return ``XmlTag(<rendered tag>)``; never raises."""
        try:
            rendered = escape(self)
        except ValueError:
            # escape() refuses strings containing both quote characters,
            # which happens whenever an attribute value includes a single
            # quote; repr() of the text is always available.
            rendered = repr(str(self))
        return 'XmlTag(' + rendered + ')'

    @classmethod
    def parse(cls, lines):
        """Parse the text of one tag into a new :class:`XmlTag`.

        The first character of ``lines`` (expected to be ``<``) is skipped
        and the rest is tokenized with ``shlex``.  Parsing is tolerant: it
        stops at ``>``, at the end of input, or at an unexpected token, and
        returns whatever was recognized up to that point.

        Raises:
            ValueError: propagated from ``shlex`` on an unterminated quote
                ("No closing quotation").
        """
        # TODO: rather call get_token() once and check that it is '<'
        parser = shlex(lines[1:])
        tag = cls()
        attr = None
        # Sub-state while an attribute name is pending:
        # '=' -> expecting an equals sign, None -> expecting the value.
        value = None
        while True:
            token = parser.get_token()  # ValueError("No closing quotation")
            if not token:
                break
            elif tag.closing is None:
                # First token: either '/' (closing tag) or the tag name.
                tag.closing = token == '/'
                if tag.opening:
                    tag.name = token
            elif tag.name is None:
                if token == '>':
                    break
                tag.name = token
            elif attr is None:
                if token == '/':
                    tag.empty = True
                elif token == '>':
                    break
                elif token == '=':
                    break  # unexpected =
                else:
                    attr = token
                    value = '='  # expecting =
            elif tag.empty:
                if token == '>':
                    break
                else:
                    break  # unexpected /
            elif value == '=':
                if token == '=':
                    value = None  # expecting a value
                elif token == '/':
                    tag.append(attr, None)
                    tag.empty = True
                elif token == '>':
                    tag.append(attr, None)
                    break
                else:
                    tag.append(attr, None)  # got an empty value
                    attr = token  # immediately followed by another
            elif value is None:
                # Non-POSIX shlex keeps the surrounding quotes; strip them.
                if (len(token) > 1 and token[0] in '\'"'
                        and token[0] == token[-1]):
                    token = token[1:-1]
                tag.append(attr, token)
                attr = None  # expecting an attribute
                value = None
        return tag


def extract_attributes_from_tag(tag):
    """Split a compact tag into attributes.

    Returns a pair ``(unambiguous, ambiguous)``.  For a plain tag this is a
    list of ``(attr, value)`` character pairs and an empty list.  For a
    comma-separated tag with no empty part, the result comes from
    :func:`put_ambiguous_attrs_aside` (a dict and a list of dicts).

    Raises:
        ValueError: if a tag has an odd number of characters.
    """
    # TODO: also return tag in .ambiguous so matching on tag works
    # MAYBE: support k1c14 (multiple-valued attributes in a short form)
    if ',' in tag:
        tags = tag.split(',')
        if '' not in tags:
            return put_ambiguous_attrs_aside(tags)

    if len(tag) % 2 != 0:
        raise ValueError('Malformed tag "{}"'.format(tag))
    return list(grouper(tag, 2)), []


def put_ambiguous_attrs_aside(tags):
    """Separate attributes shared by all alternatives from differing ones.

    Each tag is split into ``(attr, value)`` pairs plus a ``('tag', tag)``
    pair.  Attributes that have exactly one distinct value across all tags
    go into the ``unambiguous`` dict; the remaining ones are collected into
    one dict per tag (empty dicts are dropped).

    Returns:
        ``(unambiguous, ambiguous)`` -- a dict and a list of dicts.

    Raises:
        ValueError: if any tag has an odd number of characters.
    """
    # MAYBE: support ambiguous tags of uneven length: c1,c2d1
    all_values = defaultdict(set)
    parsed_tags = []
    for tag in tags:
        if len(tag) % 2 != 0:
            raise ValueError('Malformed tag "{}"'.format(tag))
        attrs = list(grouper(tag, 2)) + [('tag', tag)]
        for attr, value in attrs:
            all_values[attr].add(value)
        parsed_tags.append(attrs)

    unambiguous = {attr: values.pop()
                   for (attr, values) in all_values.items()
                   if len(values) == 1}

    ambiguous = []
    for attrs in parsed_tags:
        ambiguous_group = {attr: value for (attr, value) in attrs
                           if attr not in unambiguous}
        if ambiguous_group:
            ambiguous.append(ambiguous_group)

    return unambiguous, ambiguous


# copied from oVirt/VDSM, which in turn used
# http://docs.python.org/2.6/library/itertools.html?highlight=grouper#recipes
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    # The same iterator repeated n times makes zip_longest pull n
    # consecutive items for every output tuple.
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)


# modelled on shlex._find_unsafe
need_escaping = re.compile(r'[()=? \t\n]').search


def escape(s):
    """Quote ``s`` if it contains characters matched by ``need_escaping``.

    Non-string arguments are converted with ``str`` first.  An empty string
    becomes ``''``; a string without any character matched by
    ``need_escaping`` is returned unchanged (even if it contains quotes);
    otherwise it is wrapped in whichever quote character it does not contain.

    Raises:
        ValueError: if quoting is needed and ``s`` contains both quote
            characters.
    """
    # the beginning follows shlex.quote
    if not isinstance(s, str):
        s = str(s)
    if not s:
        return "''"
    elif need_escaping(s) is None:
        return s
    elif "'" not in s:
        return "'" + s + "'"
    elif '"' not in s:
        return '"' + s + '"'
    else:
        raise ValueError("Can't escape, the string includes both quotes: " + s)