From 42f37fe6830d6ef9e9a919f784c33ada588bed44 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 27 Jan 2017 11:33:15 +0100
Subject: [PATCH] first draft of support of enhanced dependencies

---
Review note: line breaks were reconstructed from a whitespace-collapsed copy
and the added lines were corrected, so the blob ids in the `index` lines
still refer to the original draft.

 udapi/core/links.py | 115 ++++++++++++++++++++++++++++++++++++++++++++
 udapi/core/node.py  |  39 ++++++++++++++++-
 2 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 udapi/core/links.py

diff --git a/udapi/core/links.py b/udapi/core/links.py
new file mode 100644
index 00000000..cac15586
--- /dev/null
+++ b/udapi/core/links.py
@@ -0,0 +1,115 @@
+"""Links is a class for storing a set of links with the same source node."""
+import collections
+import collections.abc
+import logging
+import re
+
+Link = collections.namedtuple('Link', 'node relation')
+
+
+class Links(list):
+    """Links class serves as a `list` with additional methods.
+
+    >>> links = Links(node, '4:nsubj|11:nsubj')
+    >>> ['%s:%s' % (link.node.ord, link.relation) for link in links]
+    ['4:nsubj', '11:nsubj']
+    >>> links[0] = Link(node=node_with_ord5, relation='obj')
+    >>> str(links)
+    '5:obj|11:nsubj'
+
+    This class provides access to both
+    * a structured (list of named tuples) representation and
+    * a string (serialized) representation of the enhanced dependencies.
+
+    Implementation details:
+    Unlike `DualDict`
+    * the structured internal storage is list, not dict
+    * the string representation is always computed on the fly, it is not stored.
+    """
+
+    def __init__(self, src_node, string=None):
+        """Create the links going from `src_node`.
+
+        `string` may be None (no links), a serialized string such as
+        '4:nsubj|11:nsubj' ('_' and '' mean no links),
+        or an iterable of `Link` namedtuples, which is copied.
+        """
+        self.src_node = src_node
+        super().__init__(self._to_links(string))
+
+    def _to_links(self, value):
+        """Convert None, a serialized string or an iterable of links to a new list."""
+        if value is None:
+            return []
+        if not isinstance(value, str):
+            return list(value)
+        if value in ('', '_'):
+            return []
+        all_nodes = self.src_node.root.descendants(add_self=1)
+        items = []
+        for edge_str in value.split('|'):
+            try:
+                # Split on the first colon only, relations may contain colons (nmod:poss).
+                trg_node_id, relation = edge_str.split(':', 1)
+            except ValueError:
+                logging.error("<%s> contains <%s> which does not contain any ':' symbol.",
+                              value, edge_str)
+                raise
+            # TODO allow `trg_node_id`s like 5.1, /zone#1, bundle/zone#1, bundle#1
+            items.append(Link(node=all_nodes[int(trg_node_id)], relation=relation))
+        return items
+
+    def __str__(self):
+        # TODO allow `trg_node_id`s like /zone#1, bundle/zone#1, bundle#1
+        serialized = ['%s:%s' % (link.node.ord, link.relation) for link in self]
+        return '|'.join(serialized) if serialized else '_'
+
+    def set_links(self, value):
+        """Set the links from a string or a sequence of `Link` namedtuples.
+
+        If the `value` is None, an empty string or '_', all links are removed.
+        If the `value` is a string, it is parsed as in `__init__`.
+        If the `value` is a sequence of `Link` namedtuples, its copy is stored.
+        Other types of `value` raise a `ValueError` exception.
+        """
+        if value is not None and not isinstance(value, collections.abc.Sequence):
+            raise ValueError("Unsupported value type " + str(value))
+        # Convert first and replace afterwards, so that the old links are kept
+        # when parsing fails and `value` may safely be this very list.
+        items = self._to_links(value)
+        self.clear()
+        self.extend(items)
+
+    def __call__(self, following_only=False, preceding_only=False, relations=None):
+        """Return a subset of links contained in this list as specified by the args.
+
+        Args:
+        following_only: keep only links to nodes which follow `src_node`
+        preceding_only: keep only links to nodes which precede `src_node`
+        relations: keep only links whose relation matches this regex (`re.match`)
+
+        If no filter is requested, `self` (not a copy) is returned.
+        """
+        if not following_only and not preceding_only and relations is None:
+            return self
+        links = list(self)
+        if preceding_only:
+            links = [link for link in links if link.node.precedes(self.src_node)]
+        if following_only:
+            links = [link for link in links if self.src_node.precedes(link.node)]
+        if relations:
+            links = [link for link in links if re.match(relations, link.relation)]
+        return Links(self.src_node, links)
+
+    @property
+    def nodes(self):
+        """Return a list of the target nodes (without relations)."""
+        return [link.node for link in self]
+
+    # TODO make sure backlinks are created and updated
+    def TODO__setitem__(self, index, new_value):
+        old_value = self[index]
+        old_value.node._enh_children = [l for l in old_value.node._enh_children if l != old_value]
+        if new_value.node._enh_children is None:
+            new_value.node._enh_children = Links(new_value.node, None)
+        super().__setitem__(index, new_value)
diff --git a/udapi/core/node.py b/udapi/core/node.py
index 6082c2b9..6a584223 100644
--- a/udapi/core/node.py
+++ b/udapi/core/node.py
@@ -6,6 +6,7 @@
 from udapi.block.write.textmodetrees import TextModeTrees
 from udapi.core.dualdict import DualDict
 from udapi.core.feats import Feats
+from udapi.core.links import Links
 
 # Pylint complains when we access e.g. node.parent._children or root._descendants
 # because it does not know that node.parent is the same class (Node)
@@ -69,6 +70,8 @@ class Node(object):
         '_misc',  # Any other annotation as udapi.core.dualdict.DualDict object.
         '_raw_deps',  # Enhanced dependencies (head-deprel pairs) in their original CoNLLU format.
         '_deps',  # Deserialized enhanced dependencies in a list of {parent, deprel} dicts.
+        '_enh_parents',  # Enhanced dependencies (head-deprel pairs) as Links object.
+        '_enh_children',  # Enhanced dependencies (child-deprel pairs) as Links object.
         '_feats',  # Morphological features as udapi.core.feats.Feats object.
         '_parent',  # Parent node.
         '_children',  # Ord-ordered list of child nodes.
@@ -76,7 +79,7 @@ class Node(object):
     ]
 
     def __init__(self, form=None, lemma=None, upos=None,  # pylint: disable=too-many-arguments
-                 xpos=None, feats=None, deprel=None, misc=None):
+                 xpos=None, feats=None, deprel=None, misc=None, enh_parents=None):
         """Create a new node and initialize its attributes using the keyword arguments."""
         self.ord = None
         self.form = form
@@ -86,6 +89,10 @@ def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many-
         self._feats = Feats(string=feats)
         self.deprel = deprel
         self._misc = DualDict(string=misc)
+        self._enh_parents = None
+        if enh_parents is not None and enh_parents != '_':
+            self._enh_parents = Links(self, enh_parents)
+        self._enh_children = None
         self._raw_deps = '_'
         self._deps = None
         self._parent = None
@@ -151,6 +158,36 @@ def misc(self):
     def misc(self, value):
         self._misc.set_mapping(value)
 
+    @property
+    def enh_parents(self):
+        """Return a list of (parent, deprel) enhanced dependencies.
+
+        To get just the parent nodes (without deprels) use
+        `enhanced_parents = node.enh_parents.nodes`
+        """
+        if self._enh_parents is None:
+            self._enh_parents = Links(self, None)
+        return self._enh_parents
+
+    @enh_parents.setter
+    def enh_parents(self, value):
+        if self._enh_parents is None:
+            if value is not None and value != '_':
+                self._enh_parents = Links(self, value)
+        else:
+            self._enh_parents.set_links(value)
+
+    @property
+    def enh_children(self):
+        """Return a list of (child, deprel) enhanced dependencies.
+
+        To get just the child nodes (without deprels) use
+        `enhanced_children = node.enh_children.nodes`
+        """
+        if self._enh_children is None:
+            self._enh_children = Links(self, None)
+        return self._enh_children
+
     @property
     def raw_deps(self):
         """String serialization of enhanced dependencies as stored in CoNLL-U files.

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy