From 68b3eab8d2ad0294cc54f94b3298084105654327 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 24 Apr 2021 11:47:40 +0200 Subject: [PATCH 001/871] udapy -TMX attributes=ord,form,misc corefud.MarkCrossing < in.conllu --- udapi/block/corefud/markcrossing.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 udapi/block/corefud/markcrossing.py diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py new file mode 100644 index 00000000..81136ec9 --- /dev/null +++ b/udapi/block/corefud/markcrossing.py @@ -0,0 +1,28 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkCrossing(Block): + """Find mentions with crossing spans.""" + + def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.continuous_only = continuous_only + self.print_form = print_form + + def _print(self, mention): + if self.print_form: + return ' '.join([w.form for w in mention.words]) + else: + return mention.span + + def process_node(self, node): + if len(node.coref_mentions) > 1: + for mA, mB in itertools.combinations(node.coref_mentions, 2): + if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.continuous_only and (',' in mA.span or ',' in mB.span): + continue + node.misc['Mark'] = f'cross:{self._print(mA)}+{self._print(mB)}' From bf866b26a40da205ea71119e7fca130ed965ffe1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 2 May 2021 14:16:51 +0200 Subject: [PATCH 002/871] prevent creating a cycle in case of () i.e. paired punctuation with no words between, which is causing a non-projective gap. 
Fix #90 --- udapi/block/ud/fixpunct.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index cc34a0d0..6fa2da8f 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -232,16 +232,19 @@ def _fix_pair(self, root, opening_node, closing_node): # let's break this rule. if len(heads) == 0: heads = punct_heads - if len(heads) == 1: + # If there are no nodes between the opening and closing mark (), + # let's treat the marks as any other (non-pair) punctuation. + if len(heads) == 0: + return + elif len(heads) == 1: opening_node.parent = heads[0] closing_node.parent = heads[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' - elif len(heads) > 1: + else: opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' + + self._punct_type[opening_node.ord] = 'opening' + self._punct_type[closing_node.ord] = 'closing' # In rare cases, non-projective gaps may remain. Let's dirty fix these! # E.g. 
in "the (lack of) reproducibility", the closing parenthesis From 8327a45c81317d61bae945acfab743b01d5693df Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 09:49:28 +0200 Subject: [PATCH 003/871] other characters need to be escaped in TeX --- udapi/block/write/tikz.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 58f53a3d..43417c61 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -91,6 +91,9 @@ def after_process_document(self, doc): logging.info('Use pdflatex to compile the output') super().after_process_document(doc) + def _tex_escape(self, string): + return string.replace('_', r'\_').replace('$', '\$').replace('[', '$[$').replace(']', '$]$') + def process_tree(self, tree): print(r'\begin{dependency}') print(r'\begin{deptext}') @@ -109,7 +112,7 @@ def process_tree(self, tree): lines = ['' for _ in self.node_attributes] for node in nodes: - values = [v.replace('_', r'\_') for v in node.get_attrs(self.node_attributes)] + values = [self._tex_escape(v) for v in node.get_attrs(self.node_attributes)] max_len = max(len(value) for value in values) for index, value in enumerate(values): if node.ord > 1: From 6bbdfc30b5f042f28bbdb19a92713e03707cc26b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 09:51:14 +0200 Subject: [PATCH 004/871] document how to use write.TextModeTrees in LaTeX & tiny improvements --- udapi/block/write/textmodetrees.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index fb38c22a..be2f999d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -127,6 +127,14 @@ class TextModeTrees(BaseWriter): which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. 
+ For use in LaTeX, you can insert the output of this block (without colors) + into \begin{verbatim}...\end{verbatim}, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preambule:: + + \\usepackage{pmboxdraw} + \DeclareUnicodeCharacter{256D}{\textSFi} %╭ + \DeclareUnicodeCharacter{2570}{\textSFii} %╰ + SEE ALSO :py:class:`.TextModeTreesHtml` """ @@ -205,7 +213,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.mark_re = re.compile(mark + '=') self.comment_mark_re = re.compile(r'^ %s = ' % mark, re.M) self._index_of = [] - self._gaps = [] + self._gaps = collections.Counter() self.lines = [] self.lengths = [] @@ -255,7 +263,6 @@ def process_tree(self, root): # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = collections.Counter() self._compute_gaps(root) # Precompute lines for printing @@ -291,7 +298,7 @@ def process_tree(self, root): # sorting the stack to minimize crossings of edges if self.minimize_cross: - stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) + stack.sort(key=lambda x: -self._gaps[x.ord]) if self.layout == 'classic': for idx, node in enumerate(allnodes): From 4a8e21aa7c920f91214010781dafe61e5567d297 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 12:01:22 +0200 Subject: [PATCH 005/871] corefud.PrintMentions for printing mentions with various properties always highlighting just a single mention per tree --- udapi/block/corefud/printmentions.py | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 udapi/block/corefud/printmentions.py diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py new file mode 100644 index 00000000..250474ce --- /dev/null +++ b/udapi/block/corefud/printmentions.py @@ -0,0 +1,55 @@ +from udapi.core.block import Block +import udapi.core.coref + +class PrintMentions(Block): + """Print mentions 
with various properties.""" + + def __init__(self, continuous='include', treelet='include', + oneword='include', singleton='include', **kwargs): + super().__init__(**kwargs) + self.continuous = self._convert(continuous) + self.treelet = self._convert(treelet) + self.oneword = self._convert(oneword) + self.singleton = self._convert(singleton) + + + def _convert(self, value): + if value in {'include', 'exclude', 'only'}: + return value + if value == 1: + return 'only' + if value == 0: + return 'exclude' + raise ValueError('unknown value ' + value) + + def _ok(self, condition, value): + if value == 'include': + return True + return (condition and value == 'only') or (not condition and value=='exclude') + + def process_document(self, doc): + for cluster in doc.coref_clusters.values(): + if not self._ok(len(cluster.mentions) == 1, self.singleton): + continue + + for mention in cluster.mentions: + if not self._ok(len(mention.words) == 1, self.oneword): + continue + if not self._ok(',' in mention.span, self.continuous): + continue + + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + if not self._ok(heads <= 1, self.treelet): + continue + + for w in mention.words: + w.misc['Mark'] = 1 + mention.head.root.draw() + for w in mention.words: + del w.misc['Mark'] + From 3c995ce236e103d26f6dcadda4535e6d4f758d7b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 12:06:44 +0200 Subject: [PATCH 006/871] oops, continuous == no comma in span --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 250474ce..b226d909 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -35,7 +35,7 @@ def process_document(self, doc): for mention in cluster.mentions: if 
not self._ok(len(mention.words) == 1, self.oneword): continue - if not self._ok(',' in mention.span, self.continuous): + if not self._ok(',' not in mention.span, self.continuous): continue heads, mwords = 0, set(mention.words) From 77ed69c83c4af449970c24ddefee1b14b908dca8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 13:25:24 +0200 Subject: [PATCH 007/871] html output and other options --- udapi/block/corefud/printmentions.py | 51 +++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index b226d909..3dd0e51d 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -1,17 +1,35 @@ from udapi.core.block import Block import udapi.core.coref +from udapi.block.write.textmodetreeshtml import TextModeTreesHtml +from udapi.block.write.textmodetrees import TextModeTrees class PrintMentions(Block): """Print mentions with various properties.""" - def __init__(self, continuous='include', treelet='include', - oneword='include', singleton='include', **kwargs): + def __init__(self, continuous='include', treelet='include', forest='include', + almost_forest='include', oneword='include', singleton='include', + max_trees=100, html=False, + print_sent_id=True, print_text=True, add_empty_line=True, indent=1, + minimize_cross=True, color=True, attributes='form,upos,deprel', + print_undef_as='_', print_doc_meta=True, print_comments=False, + mark='(Mark)', hints=True, layout='classic', + **kwargs): super().__init__(**kwargs) self.continuous = self._convert(continuous) self.treelet = self._convert(treelet) + self.forest = self._convert(forest) + self.almost_forest = self._convert(almost_forest) self.oneword = self._convert(oneword) self.singleton = self._convert(singleton) + self.max_trees = max_trees + self.html = html + print_class = TextModeTreesHtml if html else TextModeTrees + self.print_block = print_class( + 
print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, + minimize_cross=minimize_cross, color=color, attributes=attributes, + print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments, + mark=mark, hints=hints, layout=layout) def _convert(self, value): if value in {'include', 'exclude', 'only'}: @@ -22,12 +40,29 @@ def _convert(self, value): return 'exclude' raise ValueError('unknown value ' + value) + def before_process_document(self, document): + self.print_block.before_process_document(document) + + def after_process_document(self, document): + self.print_block.after_process_document(document) + def _ok(self, condition, value): if value == 'include': return True return (condition and value == 'only') or (not condition and value=='exclude') + def _is_forest(self, mention, mwords, almost): + for w in mention.words: + for ch in w.children(): + if ch not in mwords: + if not almost: + return False + if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj'}: + return False + return True + def process_document(self, doc): + printed_trees = 0 for cluster in doc.coref_clusters.values(): if not self._ok(len(cluster.mentions) == 1, self.singleton): continue @@ -46,10 +81,18 @@ def process_document(self, doc): heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 if not self._ok(heads <= 1, self.treelet): continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue for w in mention.words: w.misc['Mark'] = 1 - mention.head.root.draw() + if self.max_trees: + printed_trees += 1 + if printed_trees > self.max_trees: + return + #print(f"{printed_trees}/{self.max_trees}") + self.print_block.process_tree(mention.head.root) for w in mention.words: del w.misc['Mark'] - From 
c6c57ecfb0ee0009e3ee9dbcce900cd6c4df6ddf Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 13:44:41 +0200 Subject: [PATCH 008/871] more exceptions --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 3dd0e51d..5146a34e 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -57,7 +57,7 @@ def _is_forest(self, mention, mwords, almost): if ch not in mwords: if not almost: return False - if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj'}: + if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj', 'appos', 'cop', 'aux'}: return False return True From 45a6fe46f3fb6f9e3bcc9a43f3b2ab3d3b796fb7 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 15:40:12 +0200 Subject: [PATCH 009/871] option "empty" --- udapi/block/corefud/printmentions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 5146a34e..08fef23c 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -8,7 +8,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', - max_trees=100, html=False, + empty='include', max_trees=100, html=False, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -21,6 +21,7 @@ def __init__(self, continuous='include', treelet='include', forest='include', self.almost_forest = self._convert(almost_forest) self.oneword = self._convert(oneword) self.singleton = self._convert(singleton) + self.empty = 
self._convert(empty) self.max_trees = max_trees self.html = html @@ -73,6 +74,10 @@ def process_document(self, doc): if not self._ok(',' not in mention.span, self.continuous): continue + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue + heads, mwords = 0, set(mention.words) for w in mention.words: if w.parent: From 996517090e9c3d2da2757cf76c9e52bd880ec04a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 17:09:23 +0200 Subject: [PATCH 010/871] AnCora in CorefUD has all deprels=dep, so use upos instead --- udapi/block/corefud/printmentions.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 08fef23c..521b4cc6 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -52,13 +52,20 @@ def _ok(self, condition, value): return True return (condition and value == 'only') or (not condition and value=='exclude') + def _is_auxiliary(self, node): + if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'cop', 'aux'}: + return True + if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ'}: + return True + return False + def _is_forest(self, mention, mwords, almost): for w in mention.words: for ch in w.children(): if ch not in mwords: if not almost: return False - if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj', 'appos', 'cop', 'aux'}: + if not (w.parent and w.parent not in mwords and self._is_auxiliary(ch)): return False return True From d4bbdfd232bbf52562f389a8e39c8bd9709834bb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 12:38:22 +0200 Subject: [PATCH 011/871] corefud.PrintMentions almost_continuous --- udapi/block/corefud/printmentions.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git 
a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 521b4cc6..60de30eb 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -6,8 +6,8 @@ class PrintMentions(Block): """Print mentions with various properties.""" - def __init__(self, continuous='include', treelet='include', forest='include', - almost_forest='include', oneword='include', singleton='include', + def __init__(self, continuous='include', almost_continuous='include', treelet='include', + forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=100, html=False, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', @@ -16,6 +16,7 @@ def __init__(self, continuous='include', treelet='include', forest='include', **kwargs): super().__init__(**kwargs) self.continuous = self._convert(continuous) + self.almost_continuous = self._convert(almost_continuous) self.treelet = self._convert(treelet) self.forest = self._convert(forest) self.almost_forest = self._convert(almost_forest) @@ -69,6 +70,19 @@ def _is_forest(self, mention, mwords, almost): return False return True + def _is_almost_continuous(self, mention): + if ',' not in mention.span: + return True + nonempty = [w for w in mention.words if not w.is_empty()] + if not nonempty: + return True + mwords = set(mention.words) + gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords] + for gap_node in gap_nodes: + if not gap_node.is_empty(): + return False + return True + def process_document(self, doc): printed_trees = 0 for cluster in doc.coref_clusters.values(): @@ -80,6 +94,8 @@ def process_document(self, doc): continue if not self._ok(',' not in mention.span, self.continuous): continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): 
+ continue empty_mwords = [w for w in mention.words if w.is_empty()] if not self._ok(len(empty_mwords) > 0, self.empty): From 21c6a7f07c000430d02d8b58b739f53825085a06 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 18:21:01 +0200 Subject: [PATCH 012/871] TextModeTrees color=0 now highlights marked nodes with **asterisks** --- udapi/block/write/textmodetrees.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index be2f999d..d427098a 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -359,11 +359,14 @@ def add_node(self, idx, node): if not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) + marked = self.is_marked(node) if self.color: - marked = self.is_marked(node) for i, attr in enumerate(self.attrs): values[i] = self.colorize_attr(attr, values[i], marked) - self.lines[idx] += ' ' + ' '.join(values) + if not self.color and marked: + self.lines[idx] += ' **' + ' '.join(values) + '**' + else: + self.lines[idx] += ' ' + ' '.join(values) def is_marked(self, node): """Should a given node be highlighted?""" From ac7360d47428e296d462c89973a9282f2bb1f84e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 18:59:31 +0200 Subject: [PATCH 013/871] PrintMentions shuffle=1 by default (and shuffle=0 sorts mentions) --- udapi/block/corefud/printmentions.py | 82 +++++++++++++++------------- udapi/block/write/textmodetrees.py | 1 + 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 60de30eb..0c1d4e79 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -1,5 +1,5 @@ +import random from udapi.core.block import Block -import udapi.core.coref from udapi.block.write.textmodetreeshtml import 
TextModeTreesHtml from udapi.block.write.textmodetrees import TextModeTrees @@ -8,7 +8,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', - empty='include', max_trees=100, html=False, + empty='include', max_trees=0, html=False, shuffle=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -26,6 +26,9 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i self.max_trees = max_trees self.html = html + self.shuffle = shuffle + if shuffle: + random.seed(42) print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -84,43 +87,48 @@ def _is_almost_continuous(self, mention): return True def process_document(self, doc): - printed_trees = 0 + mentions = [] for cluster in doc.coref_clusters.values(): - if not self._ok(len(cluster.mentions) == 1, self.singleton): - continue + if self._ok(len(cluster.mentions) == 1, self.singleton): + mentions.extend(cluster.mentions) + if self.shuffle: + random.shuffle(mentions) + else: + mentions.sort() - for mention in cluster.mentions: - if not self._ok(len(mention.words) == 1, self.oneword): - continue - if not self._ok(',' not in mention.span, self.continuous): - continue - if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): - continue + printed_trees = 0 + for mention in mentions: + if not self._ok(len(mention.words) == 1, self.oneword): + continue + if not self._ok(',' not in mention.span, self.continuous): + continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), 
self.almost_continuous): + continue - empty_mwords = [w for w in mention.words if w.is_empty()] - if not self._ok(len(empty_mwords) > 0, self.empty): - continue + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue - heads, mwords = 0, set(mention.words) - for w in mention.words: - if w.parent: - heads += 0 if w.parent in mwords else 1 - else: - heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 - if not self._ok(heads <= 1, self.treelet): - continue - if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): - continue - if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): - continue + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + if not self._ok(heads <= 1, self.treelet): + continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue - for w in mention.words: - w.misc['Mark'] = 1 - if self.max_trees: - printed_trees += 1 - if printed_trees > self.max_trees: - return - #print(f"{printed_trees}/{self.max_trees}") - self.print_block.process_tree(mention.head.root) - for w in mention.words: - del w.misc['Mark'] + for w in mention.words: + w.misc['Mark'] = 1 + if self.max_trees: + printed_trees += 1 + if printed_trees > self.max_trees: + print(f'######## Only first {self.max_trees} trees printed. 
Use max_trees=0 to see all.') + return + self.print_block.process_tree(mention.head.root) + for w in mention.words: + del w.misc['Mark'] diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index d427098a..f3f6e007 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -365,6 +365,7 @@ def add_node(self, idx, node): values[i] = self.colorize_attr(attr, values[i], marked) if not self.color and marked: self.lines[idx] += ' **' + ' '.join(values) + '**' + self.lengths[idx] += 4 else: self.lines[idx] += ' ' + ' '.join(values) From 93269127f43089e79dc250cfd4968bf0d3aff811 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 19:20:46 +0200 Subject: [PATCH 014/871] printMentions print_other_forms=5 by default --- udapi/block/corefud/printmentions.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 0c1d4e79..4a122ccb 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -1,4 +1,5 @@ import random +from collections import Counter from udapi.core.block import Block from udapi.block.write.textmodetreeshtml import TextModeTreesHtml from udapi.block.write.textmodetrees import TextModeTrees @@ -8,7 +9,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', - empty='include', max_trees=0, html=False, shuffle=True, + empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -29,6 +30,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i self.shuffle = 
shuffle if shuffle: random.seed(42) + self.print_other_forms = print_other_forms print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -129,6 +131,20 @@ def process_document(self, doc): if printed_trees > self.max_trees: print(f'######## Only first {self.max_trees} trees printed. Use max_trees=0 to see all.') return + + this_form = ' '.join([w.form for w in mention.words]) + print("# Mention = " + this_form) + if self.print_other_forms: + counter = Counter() + for m in mention.cluster.mentions: + forms = ' '.join([w.form for w in m.words]) + if forms != this_form: + counter[forms] += 1 + if counter: + print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='') + for form, count in counter.most_common(self.print_other_forms): + print(f' "{form}"({count})', end='') + print() self.print_block.process_tree(mention.head.root) for w in mention.words: del w.misc['Mark'] From 61ba542511507546d0a718458ebf3f1ac85eb903 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 14 May 2021 14:03:21 +0200 Subject: [PATCH 015/871] better copula handling --- udapi/block/corefud/printmentions.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 4a122ccb..e4c4cd5d 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -58,20 +58,27 @@ def _ok(self, condition, value): return True return (condition and value == 'only') or (not condition and value=='exclude') - def _is_auxiliary(self, node): - if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'cop', 'aux'}: + def _is_auxiliary_etc(self, node): + if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'aux', 'vocative'}: return True - if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ'}: 
+ if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: return True return False def _is_forest(self, mention, mwords, almost): for w in mention.words: - for ch in w.children(): + # UD unfortunatelly does not use the copula-as-head style for copula construction, + # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children. + # However, in the cop-as-head stule, only "my" would depend on "fault" (and should be part of the mention). + # It is difficult to tell apart which w.children are related to w and which to the copula. + # We thus ignore these cases completely (we expect any child is potentially related to the copula). + if any(ch.udeprel == 'cop' for ch in w.children): + continue + for ch in w.children: if ch not in mwords: if not almost: return False - if not (w.parent and w.parent not in mwords and self._is_auxiliary(ch)): + if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): return False return True From b08bfdfa6c2ec1759d13f49b45e3c59e74fdcfff Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 14 May 2021 14:16:13 +0200 Subject: [PATCH 016/871] aux should not be in the list when copula constructions are handled elsewhere --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index e4c4cd5d..33efdd66 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -59,7 +59,7 @@ def _ok(self, condition, value): return (condition and value == 'only') or (not condition and value=='exclude') def _is_auxiliary_etc(self, node): - if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'aux', 'vocative'}: + if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}: return True if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: return True From 
39134031a5c5e7544dab32f7ec83875dfde35f01 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 18 May 2021 04:07:12 +0200 Subject: [PATCH 017/871] corefud.PrintMentions print_total=1 --- udapi/block/corefud/printmentions.py | 51 ++++++++++++++++------------ 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 33efdd66..e26ee6e2 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -10,6 +10,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, + print_total=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -31,6 +32,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i if shuffle: random.seed(42) self.print_other_forms = print_other_forms + self.print_total = print_total, print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -105,7 +107,7 @@ def process_document(self, doc): else: mentions.sort() - printed_trees = 0 + seen_trees = 0 for mention in mentions: if not self._ok(len(mention.words) == 1, self.oneword): continue @@ -133,25 +135,32 @@ def process_document(self, doc): for w in mention.words: w.misc['Mark'] = 1 - if self.max_trees: - printed_trees += 1 - if printed_trees > self.max_trees: - print(f'######## Only first {self.max_trees} trees printed. 
Use max_trees=0 to see all.') + + seen_trees += 1 + if self.max_trees and seen_trees > self.max_trees: + if not self.print_total: + print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') return + else: + this_form = ' '.join([w.form for w in mention.words]) + print("# Mention = " + this_form) + if self.print_other_forms: + counter = Counter() + for m in mention.cluster.mentions: + forms = ' '.join([w.form for w in m.words]) + if forms != this_form: + counter[forms] += 1 + if counter: + print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='') + for form, count in counter.most_common(self.print_other_forms): + print(f' "{form}"({count})', end='') + print() + self.print_block.process_tree(mention.head.root) + for w in mention.words: + del w.misc['Mark'] + + if self.print_total: + if self.max_trees and seen_trees > self.max_trees: + print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}') - this_form = ' '.join([w.form for w in mention.words]) - print("# Mention = " + this_form) - if self.print_other_forms: - counter = Counter() - for m in mention.cluster.mentions: - forms = ' '.join([w.form for w in m.words]) - if forms != this_form: - counter[forms] += 1 - if counter: - print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='') - for form, count in counter.most_common(self.print_other_forms): - print(f' "{form}"({count})', end='') - print() - self.print_block.process_tree(mention.head.root) - for w in mention.words: - del w.misc['Mark'] From 0d98b5fa5ddeefd991f79482cfcf20e1657dab9b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 21 May 2021 19:08:14 +0200 Subject: [PATCH 018/871] prevent paired punct attached non-projectively fixes #87 --- udapi/block/ud/fixpunct.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git 
a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 6fa2da8f..95cb40d0 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -2,7 +2,7 @@ Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct, and is always attached projectively, usually to the head of a neighboring subtree -to its left or right. +to its left or right (see https://universaldependencies.org/u/dep/punct.html). Punctuation normally does not have children. If it does, we will fix it first. This block tries to re-attach punctuation projectively and according to the guidelines. @@ -236,12 +236,18 @@ def _fix_pair(self, root, opening_node, closing_node): # let's treat the marks as any other (non-pair) punctuation. if len(heads) == 0: return - elif len(heads) == 1: - opening_node.parent = heads[0] - closing_node.parent = heads[0] else: - opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] - closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] + # Ideally, there should be only a single head. + # If not, we could try e.g. to choose the "widests-span head": + # opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] + # closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] + # which often leads to selecting the same head for the opening and closing punctuation + # ignoring single words inside the paired punct which are non-projectively attached outside. + # However, this means that the paired punctuation will be attached non-projectively, + # which is forbidden by the UD guidelines. + # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities. 
+ opening_node.parent = heads[0] + closing_node.parent = heads[-1] self._punct_type[opening_node.ord] = 'opening' self._punct_type[closing_node.ord] = 'closing' From 1eee540ad18c00525941cebc814a76c88550c027 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 24 Jun 2021 17:52:46 +0200 Subject: [PATCH 019/871] util.MarkDiff print_stats=6 prints top 6 changes --- udapi/block/util/markdiff.py | 43 ++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 22a7a03e..3d183f57 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -1,5 +1,7 @@ """util.MarkDiff is a special block for marking differences between parallel trees.""" +import collections import difflib +import pprint from udapi.core.block import Block @@ -7,13 +9,25 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, **kwargs): - """Create the Mark block object.""" + mark=1, add=False, print_stats=0, **kwargs): + """Create the Mark block object. + Params: + gold_zone: Which of the zones should be treated as gold? + (The changes are interpreted as from a "pred"=predicted zone into the gold zone.) + attributes: Which node attributes should be considered when searching for diffs? + The tree topology, i.e. node parent is always considered. + mark: What value should be used in `node.misc['Mark']` of the differing nodes? + add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, + so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. + print_stats: How many lines of statistics should be printed? -1 means all. 
+ """ super().__init__(**kwargs) self.gold_zone = gold_zone self.attrs = attributes.split(',') self.mark = mark self.add = add + self.print_stats = print_stats + self.stats = collections.Counter() def process_tree(self, tree): gold_tree = tree.bundle.get_tree(self.gold_zone) @@ -49,6 +63,31 @@ def process_tree(self, tree): if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: p_node.misc['Mark'] = self.mark g_node.misc['Mark'] = self.mark + self.stats['ONLY-PARENT-CHANGED'] += 1 else: for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: node.misc['Mark'] = self.mark + if self.print_stats: + if edit == 'replace': + # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED + n = min(pred_hi - pred_lo, gold_hi - gold_lo) + for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n], gold_nodes[gold_lo:gold_lo + n]): + for attr in self.attrs: + p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) + if p_value != g_value: + self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 + if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['PARENT-CHANGED'] += 1 + pred_lo, gold_lo = pred_lo + n, gold_lo + n + for node in gold_nodes[gold_lo:gold_hi]: + self.stats['ADD-WORD'] += 1 + self.stats['ADD-LEMMA: ' + node.lemma] += 1 + for node in pred_nodes[pred_lo:pred_hi]: + self.stats['DELETE-WORD'] += 1 + self.stats['DELETE-LEMMA: ' + node.lemma] += 1 + + def process_end(self): + if self.print_stats: + how_many = None if self.print_stats in (-1, '-1') else self.print_stats + for edit, count in self.stats.most_common(how_many): + print(f'{count:4} {edit}') From 79f4cb5e1839b8250a6398f5fd00182bddb5417e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 7 Jul 2021 16:58:54 +0200 Subject: [PATCH 020/871] prevent fatal errors in eval.F1 `eval.F1` calls `self.before_process_document(None)` --- udapi/core/basewriter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 0db348a8..cc72c6e7 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -45,7 +45,8 @@ def next_filename(self): return self.files.next_filename() def before_process_document(self, document): - udapi.core.coref.store_coref_to_misc(document) + if document: + udapi.core.coref.store_coref_to_misc(document) if self.orig_files == '': logging.info('Writing to filehandle.') sys.stdout = self.files.filehandle From 368349003b4388ac9217ef1f7a4e844469e933fb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 23 Jul 2021 16:57:51 +0200 Subject: [PATCH 021/871] eval.F1 should not fail if some node attributes are None --- udapi/block/eval/f1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 982e4190..9f265ac7 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -110,8 +110,8 @@ def process_tree(self, tree): return self.visited_zones[tree.zone] += 1 - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in gold_tree.descendants] # lcs("abc", "acb") can be either "ab" or "ac". # We want to prefer the LCS with the highest number of non-focused tokens. From 2fe54cde2d8465527092b8804b624d1e35bdf7cf Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Sep 2021 11:41:42 +0200 Subject: [PATCH 022/871] Archiving work I did with German HDT between UD 2.8 and 2.9. I only now realize that I could have used the existing block ud.de.AddMwt. However, there are some differences, so for the time being, I am keeping both blocks. 
--- udapi/block/ud/de/fixhdt.py | 109 ++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 udapi/block/ud/de/fixhdt.py diff --git a/udapi/block/ud/de/fixhdt.py b/udapi/block/ud/de/fixhdt.py new file mode 100644 index 00000000..a3792a96 --- /dev/null +++ b/udapi/block/ud/de/fixhdt.py @@ -0,0 +1,109 @@ +""" +Block to fix annotation of UD German-HDT. + +It was created independently of ud.de.AddMwt but it aims to do essentially the +same thing. Future work: make the two blocks converge. + +Currently known differences: +- This block covers a wider range of contractions. +- This block generates morphological features for the syntactic words. +- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT). +- This block overrides the default attachment when the original relation is root, conj, reparandum. +- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures. +""" +from udapi.core.block import Block +import logging +import re + +class FixHDT(Block): + + def process_node(self, node): + # PronType=Art with ADP is wrong. Fused prepositions and articles should be decomposed in UD. + # The following contractions have been observed: + # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums unterm unters vom vorm vors z. zum zur + if node.upos == 'ADP' and node.feats['PronType'] == 'Art': + if re.match("^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums|unter[ms]|vom|vor[ms]|z\.|zu[mr])$", node.form, re.IGNORECASE): + # We need two nodes instead of one. Create a node. + # The parent should not be the root but unfortunately it is not guaranteed. 
+ node2 = node.create_child() + node2.shift_after_node(node) + if not re.match(r"^(root|conj|reparandum)$", node.udeprel): + node2.parent = node.parent + node.deprel = 'case' + node2.deprel = 'det' + mwt = node.root.create_multiword_token(form=node.form, words=[node, node2], misc=node.misc) + node.misc['SpaceAfter'] = '' + # We want to respect the original letter case in the forms of the syntactic words. + # We can use the isupper() method to find out whether all letters are uppercase. + # However, detecting first-letter capitalization requires more work. + up = 2 if mwt.form.isupper() else 1 if mwt.form[:1].isupper() else 0 + up2 = 2 if up == 2 else 0 + if re.match(r"^(a\.|am|ans)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'an') + node.lemma = 'an' + elif re.match(r"^aufs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'auf') + node.lemma = 'auf' + elif re.match(r"^beim$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'bei') + node.lemma = 'bei' + elif re.match(r"^durchs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'durch') + node.lemma = 'durch' + elif re.match(r"^fürs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'für') + node.lemma = 'für' + elif re.match(r"^hinter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'hinter') + node.lemma = 'hinter' + elif re.match(r"^(im|ins)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'in') + node.lemma = 'in' + elif re.match(r"^übers$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'über') + node.lemma = 'über' + elif re.match(r"^ums$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'um') + node.lemma = 'um' + elif re.match(r"^unter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'unter') + node.lemma = 'unter' + elif re.match(r"^vom$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'von') + node.lemma = 'von' + elif re.match(r"^vor[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'vor') + 
node.lemma = 'vor' + elif re.match(r"^(z\.|zu[mr])$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'zu') + node.lemma = 'zu' + node.upos = 'ADP' + node.xpos = 'APPR' + node.feats = '_' + node.feats['AdpType'] = 'Prep' + # We must use search() because match() only checks at the beginning of the string. + if re.search("[m\.]$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'dem') + node2.feats = 'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + elif re.search("s$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'das') + node2.feats = 'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Acc' + node2.lemma = 'der' + elif re.search("r$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'der') + node2.feats = 'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + node2.upos = 'DET' + node2.xpos = 'ART' + +def mimic_case(up, x): + if up >= 2: + return x.upper() + elif up == 1: + return x[:1].upper() + x[1:].lower() + else: + return x.lower() From 33d8e1ce46c22f614e7fc451422278039dfafbb0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 20:15:59 +0200 Subject: [PATCH 023/871] German preposition-article contractions. 
--- udapi/block/ud/de/addmwt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/de/addmwt.py b/udapi/block/ud/de/addmwt.py index 23ac54f9..18778a4a 100644 --- a/udapi/block/ud/de/addmwt.py +++ b/udapi/block/ud/de/addmwt.py @@ -16,15 +16,16 @@ 'durchs': {'form': 'durch das', }, 'fürs': {'form': 'fürs das', }, 'hinterm': {'form': 'hinter dem', }, + 'hinters': {'form': 'hinter das', }, 'im': {'form': 'in dem', }, 'ins': {'form': 'in das', }, 'übers': {'form': 'über das', }, 'ums': {'form': 'um das', }, - 'unters': {'form': 'unter das', }, 'unterm': {'form': 'unter dem', }, + 'unters': {'form': 'unter das', }, 'vom': {'form': 'von dem', }, - 'vors': {'form': 'vor das', }, 'vorm': {'form': 'vor dem', }, + 'vors': {'form': 'vor das', }, 'zum': {'form': 'zu dem', }, 'zur': {'form': 'zu der', }, } From 1487de76a72fa42b95274bf4237af6fb7d55b7e8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 20:52:32 +0200 Subject: [PATCH 024/871] Archiving my fixes for Catalan and Spanish. --- udapi/block/ud/ca/__init__.py | 0 udapi/block/ud/ca/addmwt.py | 194 ++++++++++++++++++++++++++++++++++ udapi/block/ud/es/addmwt.py | 9 +- 3 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 udapi/block/ud/ca/__init__.py create mode 100644 udapi/block/ud/ca/addmwt.py diff --git a/udapi/block/ud/ca/__init__.py b/udapi/block/ud/ca/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/ca/addmwt.py b/udapi/block/ud/ca/addmwt.py new file mode 100644 index 00000000..49b79da1 --- /dev/null +++ b/udapi/block/ud/ca/addmwt.py @@ -0,0 +1,194 @@ +"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions. + +According to the UD guidelines, contractions such as "del" = "de el" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs. 
+""" +import re +import udapi.block.ud.addmwt + +MWTS = { + 'al': {'form': 'a el', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'als': {'form': 'a els', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'del': {'form': 'de el', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'dels': {'form': 'de els', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'pel': {'form': 'per el', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'pels': {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'] + v['upos'] = 'ADP DET' + v['deprel'] = '* det' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def __init__(self, verbpron=False, **kwargs): + super().__init__(**kwargs) + self.verbpron = verbpron + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + + if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. + analysis = analysis.copy() + analysis['shape'] = 'subtree' + return analysis + return None + + def fix_personal_pronoun(self, node): + # There is a mess in lemmas and features of personal pronouns. 
+ if node.upos == 'PRON': + if re.match("^jo$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Nom|Number=Sing|Person=1|PronType=Prs' + if re.match("^(em|m'|-me|'m|me|m)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs' + if re.match("^mi$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs' + if re.match("^tu$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs' + if re.match("^(et|t'|-te|'t|te|t)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs' + if re.match("^ti$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). 
+ if re.match("^ell$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^ella$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(el|-lo|'l|lo)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(la|-la)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(l')$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(ho|-ho)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs' + if re.match("^(li|-li)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Dat|Number=Sing|Person=3|PronType=Prs' + if re.match("^(es|s'|-se|'s|se|s)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes' + if re.match("^si$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes' + # If nosaltres can be used after a preposition, we should not tag it as nominative. + if re.match("^nosaltres$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Plur|Person=1|PronType=Prs' + # Nós is the majestic first person singular. In accusative and dative, it is identical to first person plural. 
+ if re.match("^nós$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Sing|Person=1|Polite=Form|PronType=Prs' + if re.match("^(ens|-nos|'ns|nos|ns)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs' + if re.match("^vosaltres$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|PronType=Prs' + # Vós is the formal second person singular. In accusative and dative, it is identical to second person plural. + # Vostè is even more formal than vós. In accusative and dative, it is identical to third person singular. + if re.match("^(vós|vostè)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Sing|Person=2|Polite=Form|PronType=Prs' + if re.match("^vostès$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|Polite=Form|PronType=Prs' + if re.match("^(us|-vos|-us|vos)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). + if re.match("^ells$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Plur|Person=3|PronType=Prs' + if re.match("^elles$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # Els is masculine accusative, or dative in any gender. + if re.match("^(els|-los|'ls|los|ls)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs' + if re.match("^(les|-les)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # There are also "adverbial" pronominal clitics that can occur at direct object positions. 
+ if re.match("^(en|n'|'n|-ne|n|ne)$", node.form, re.IGNORECASE): + node.lemma = 'en' + node.feats = 'Case=Gen|Person=3|PronType=Prs' + if re.match("^(hi|-hi)$", node.form, re.IGNORECASE): + node.lemma = 'hi' + node.feats = 'Case=Loc|Person=3|PronType=Prs' + + def report_suspicious_lemmas(self, node): + # There are offset issues of splitted multi_word_expressions. + # Sometimes a word gets the lemma of the neighboring word. + if node.form.lower()[:1] != node.lemma.lower()[:1]: + # Exclude legitimate cases where the lemma starts with a different letter. + hit = True + if node.lemma == 'jo' and re.match("(em|ens|m'|me|mi|nos|nosaltres|'ns)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'tu' and re.match("(et|'t|us|vosaltres|vostè)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'el' and re.match("(la|l|l'|les)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ell' and re.match("(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'es' and re.match("(s|se)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'em' and re.match("('m|m|m')", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'en' and re.match("('n|n'|ne|-ne)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'anar' and re.match("(va|van|vàrem)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ser' and re.match("(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'estar' and re.match("(sigut)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'caure' and re.match("(queia|queies|quèiem|quèieu|queien)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ampli' and re.match("(àmplia|àmplies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'indi' and re.match("(índies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'obvi' and 
re.match("(òbvia)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ossi' and re.match("(òssies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ús' and re.match("(usos)", node.form, re.IGNORECASE): + hit = False + # Form = '2001/37/CE', lemma = 'CE' + # Form = 'nº5', lemma = '5' + # Form = 'kg.', lemma = 'quilogram' + # Form = 'un', lemma = '1' + if node.lemma == 'CE' or re.match("nº", node.form, re.IGNORECASE) or re.match("^quil[oò]", node.lemma, re.IGNORECASE) or re.match("^[0-9]+$", node.lemma): + hit = False + if hit: + print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address())) diff --git a/udapi/block/ud/es/addmwt.py b/udapi/block/ud/es/addmwt.py index ee85b1d6..92f80160 100644 --- a/udapi/block/ud/es/addmwt.py +++ b/udapi/block/ud/es/addmwt.py @@ -1,6 +1,6 @@ """Block ud.es.AddMwt for heuristic detection of Spanish contractions. -According to the UD guidelines, contractions such as "dele" = "de ele" +According to the UD guidelines, contractions such as "del" = "de el" should be annotated using multi-word tokens. Note that this block should be used only for converting legacy conllu files. @@ -28,7 +28,7 @@ v['lemma'] = v['form'] v['upos'] = 'ADP DET' v['deprel'] = '* det' - v['feats'] = '_ *' + v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art' # The following are the default values # v['main'] = 0 # which of the two words will inherit the original children (if any) # v['shape'] = 'siblings', # the newly created nodes will be siblings @@ -46,6 +46,11 @@ def multiword_analysis(self, node): analysis = MWTS.get(node.form.lower(), None) if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. 
+ analysis = analysis.copy() + analysis['shape'] = 'subtree' return analysis if not self.verbpron or node.upos not in {'VERB', 'AUX'}: From 27bf3a3b82b4a88ecaeee2bc2843efaf383bc249 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 21:02:48 +0200 Subject: [PATCH 025/871] Archiving my fixes for Indonesian GSD. --- udapi/block/ud/id/__init__.py | 0 udapi/block/ud/id/fixgsd.py | 67 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 udapi/block/ud/id/__init__.py create mode 100644 udapi/block/ud/id/fixgsd.py diff --git a/udapi/block/ud/id/__init__.py b/udapi/block/ud/id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py new file mode 100644 index 00000000..1ab30dd0 --- /dev/null +++ b/udapi/block/ud/id/fixgsd.py @@ -0,0 +1,67 @@ +"""Block to fix annotation of UD Indonesian-GSD.""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def lemmatize_verb_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. + # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + if node.upos == "VERB": + morphind = node.misc["MorphInd"] + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_VS[AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + if len(morphemes) > 1 and re.match(r"^(kan|i|an)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". 
+ while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s'" % (len(morphemes), morphemes, morphind)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r"<[a-z]+>$", "", lemma) + node.lemma = lemma + + def merge_reduplicated_plural(self, node): + # Instead of compound:plur, merge the reduplicated plurals into a single token. + if node.deprel == "compound:plur": + root = node.root + # We assume that the previous token is a hyphen and the token before it is the parent. + first = node.parent + if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r"^(-|–|--)$", hyph.form): + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + first.form = first.form + "-" + node.form + first.feats["Number"] = "Plur" + if node.no_space_after: + first.misc["SpaceAfter"] = "No" + else: + first.misc["SpaceAfter"] = "" + hyph.remove() + node.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + + def process_node(self, node): + self.lemmatize_verb_from_morphind(node) From 9cb18121557f553068fc4c17bd5feea16227ef96 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 21:05:35 +0200 Subject: [PATCH 026/871] "per-" is also a prefix that should be removed from the lemma (confirmed by Ika). 
--- udapi/block/ud/id/fixgsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 1ab30dd0..a629d712 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -23,7 +23,7 @@ def lemmatize_verb_from_morphind(self, node): del morphemes[-1] # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". - while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se)$", morphemes[0]): + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): del morphemes[0] # Check that we are left with just one morpheme. if len(morphemes) != 1: From e8194c4dc6726685d26eee5f9761f2ef1335fc31 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 21:10:26 +0200 Subject: [PATCH 027/871] Extended debugging message. --- udapi/block/ud/id/fixgsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index a629d712..b3328273 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -27,7 +27,7 @@ def lemmatize_verb_from_morphind(self, node): del morphemes[0] # Check that we are left with just one morpheme. if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s'" % (len(morphemes), morphemes, morphind)) + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s'" % (len(morphemes), morphemes, morphind, node.form)) else: lemma = morphemes[0] # Remove the stem POS category. From bc0a1d3e969698be93cd9bf8e2bd5a0aa67cad9c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 21:59:46 +0200 Subject: [PATCH 028/871] Indonesian fixes. 
--- udapi/block/ud/id/addmwt.py | 16 ++++++++++++++++ udapi/block/ud/id/fixgsd.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 udapi/block/ud/id/addmwt.py diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py new file mode 100644 index 00000000..3fe39dd8 --- /dev/null +++ b/udapi/block/ud/id/addmwt.py @@ -0,0 +1,16 @@ +""" +Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with +MorphInd whose output is stored in MISC attribute MorphInd). +""" +import udapi.block.ud.addmwt + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE): + splitform = re.sub(r'(nya)$', r' \1', re.IGNORECASE) + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'shape': 'subtree', 'deprel': '* obj'} + return None diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index b3328273..0ec79f2e 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -27,7 +27,7 @@ def lemmatize_verb_from_morphind(self, node): del morphemes[0] # Check that we are left with just one morpheme. if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s'" % (len(morphemes), morphemes, morphind, node.form)) + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) else: lemma = morphemes[0] # Remove the stem POS category. 
From 70361e213d59709d0793992b01476bf5373a6e46 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 22:07:29 +0200 Subject: [PATCH 029/871] Features of -nya. --- udapi/block/ud/id/addmwt.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 3fe39dd8..2d46fb67 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -11,6 +11,15 @@ def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE): splitform = re.sub(r'(nya)$', r' \1', re.IGNORECASE) + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + if node.feats["Number[psor]"] != "Sing": + logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats["Number[psor]"])) + if node.feats["Person[psor]"] != "3": + logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats["Person[psor]"])) + node.feats["Number[psor]"] = '' + node.feats["Person[psor]"] = '' + pronfeats = 'Number=Sing|Person=3|PronType=Prs' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) - return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'shape': 'subtree', 'deprel': '* obj'} + return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'shape': 'subtree', 'deprel': '* obj'} return None From 87909af8a5ff6e4d19fcdd4640babe1f502e260e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 22:09:25 +0200 Subject: [PATCH 030/871] Bug fix. 
--- udapi/block/ud/id/addmwt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 2d46fb67..4a2158c4 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -3,6 +3,8 @@ MorphInd whose output is stored in MISC attribute MorphInd). """ import udapi.block.ud.addmwt +import logging +import re class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" From 0a5148670f4133505d9e0524b41cedcbec923713 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 22:13:05 +0200 Subject: [PATCH 031/871] Bug fix. --- udapi/block/ud/id/addmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 4a2158c4..da5219f0 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -12,7 +12,7 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE): - splitform = re.sub(r'(nya)$', r' \1', re.IGNORECASE) + splitform = re.sub(r'(nya)$', r' \1', flags=re.IGNORECASE) # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. if node.feats["Number[psor]"] != "Sing": From 141f100c71ae5ec33c3e126841d63234e3c16f1a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 22:14:09 +0200 Subject: [PATCH 032/871] Bug fix. 
--- udapi/block/ud/id/addmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index da5219f0..c0c40486 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -12,7 +12,7 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE): - splitform = re.sub(r'(nya)$', r' \1', flags=re.IGNORECASE) + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. if node.feats["Number[psor]"] != "Sing": From da307863288f6e9872b2f6f6ace9555f1763ecf4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 22:21:20 +0200 Subject: [PATCH 033/871] Narrowing down the conditions for -nya. --- udapi/block/ud/id/addmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index c0c40486..023f10fa 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -11,7 +11,7 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" - if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE): + if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia

_PS3$', node.misc["MorphInd"]): splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. From 0d54d60ba290b9bf3cbb0d245fed8eda923ff154 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 22:52:41 +0200 Subject: [PATCH 034/871] More refined -nya segmentation. --- udapi/block/ud/id/addmwt.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 023f10fa..6231c7e6 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -11,17 +11,26 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" - if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia

_PS3$', node.misc["MorphInd"]): + if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia

_PS3\$$', node.misc['MorphInd']): splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. - if node.feats["Number[psor]"] != "Sing": - logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats["Number[psor]"])) - if node.feats["Person[psor]"] != "3": - logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats["Person[psor]"])) - node.feats["Number[psor]"] = '' - node.feats["Person[psor]"] = '' + if node.feats['Number[psor]'] != 'Sing': + logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]'])) + if node.feats['Person[psor]'] != '3': + logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' pronfeats = 'Number=Sing|Person=3|PronType=Prs' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = 'obl:agent' if re.match(r'^\^di\+', node.misc['MorphInd']) else 'obj' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) - return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'shape': 'subtree', 'deprel': '* obj'} + return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'xpos': xpos, 'shape': 'subtree', 'deprel': '* '+deprel} return None + + def postprocess_mwt(self, mwt): + """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" + match = re.match(r'^\^(.*)\+(dia

_PS3)\$$', mwt.misc['MorphInd']) + if match: + mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' + mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$' From 1eaaebdb7311089921161e2045b407a95387d442 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 10:46:41 +0200 Subject: [PATCH 035/871] Indonesian proper nouns do not form plural. --- udapi/block/ud/id/fixgsd.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 0ec79f2e..552d6743 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -63,5 +63,17 @@ def merge_reduplicated_plural(self, node): # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() + def fix_plural_propn(self, node): + """ + It is unlikely that a proper noun will have a plural form in Indonesian. + All examples observed in GSD should actually be tagged as common nouns. + """ + if node.upos == 'PROPN' and node.feats['Number'] == 'Plur': + node.upos = 'NOUN' + node.lemma = node.lemma.lower() + if node.upos == 'PROPN': + node.feats['Number'] = '' + def process_node(self, node): - self.lemmatize_verb_from_morphind(node) + self.fix_plural_propn(node) + #self.lemmatize_verb_from_morphind(node) From 3db7f05bf8075e81e6c274db0d22b8d8de5b4eef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 12:34:43 +0200 Subject: [PATCH 036/871] Modified splitting of VERB+nya after some more input from Ika. 
--- udapi/block/ud/id/addmwt.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 6231c7e6..cc3a0fee 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -13,6 +13,13 @@ def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia

_PS3\$$', node.misc['MorphInd']): splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + # For transitive verbs with the meN- prefix, -nya is an object clitic. + # For passive verbs with the di- prefix, -nya refers to a passive agent. + # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. + # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). + menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False + diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False + nominalization = not menverb and not diverb # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. if node.feats['Number[psor]'] != 'Sing': @@ -21,11 +28,19 @@ def multiword_analysis(self, node): logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' - pronfeats = 'Number=Sing|Person=3|PronType=Prs' + if nominalization: + lemma = splitform.lower() + upos = 'VERB DET' + feats = '* Definite=Def|PronType=Art' + deprel = '* det' + else: + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + upos = 'VERB PRON' + feats = '* Number=Sing|Person=3|PronType=Prs' + deprel = '* obj:agent' if diverb else '* obj' xpos = re.sub(r'\+', ' ', node.xpos) - deprel = 'obl:agent' if re.match(r'^\^di\+', node.misc['MorphInd']) else 'obj' # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) - return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'xpos': xpos, 'shape': 'subtree', 'deprel': '* '+deprel} + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} return None def postprocess_mwt(self, mwt): From ad0f1340276237b02ed2e0423147cbdb5b11a9b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 14:29:27 +0200 Subject: [PATCH 037/871] Indonesian -nya with nouns. --- udapi/block/ud/id/addmwt.py | 75 +++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index cc3a0fee..e22f27bd 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -11,36 +11,55 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" - if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia

_PS3\$$', node.misc['MorphInd']): - splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) - # For transitive verbs with the meN- prefix, -nya is an object clitic. - # For passive verbs with the di- prefix, -nya refers to a passive agent. - # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. - # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). - menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False - diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False - nominalization = not menverb and not diverb - # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. - # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. - if node.feats['Number[psor]'] != 'Sing': - logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]'])) - if node.feats['Person[psor]'] != '3': - logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) - node.feats['Number[psor]'] = '' - node.feats['Person[psor]'] = '' - if nominalization: - lemma = splitform.lower() - upos = 'VERB DET' - feats = '* Definite=Def|PronType=Art' - deprel = '* det' - else: + if re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia

_PS3\$$', node.misc['MorphInd']): + if node.upos == 'VERB': + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + # For transitive verbs with the meN- prefix, -nya is an object clitic. + # For passive verbs with the di- prefix, -nya refers to a passive agent. + # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. + # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). + menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False + diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False + nominalization = not menverb and not diverb + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + if node.feats['Number[psor]'] != 'Sing': + logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]'])) + if node.feats['Person[psor]'] != '3': + logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + if nominalization: + lemma = splitform.lower() + upos = 'VERB DET' + feats = '* Definite=Def|PronType=Art' + deprel = '* det' + else: + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + upos = 'VERB PRON' + feats = '* Number=Sing|Person=3|PronType=Prs' + deprel = '* obj:agent' if diverb else '* obj' + xpos = re.sub(r'\+', ' ', node.xpos) + # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'(NOUN|PROPN|X)', node.upos): + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + if node.feats['Number[psor]'] != 'Sing': + logging.warning("Noun '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]'])) + if node.feats['Person[psor]'] != '3': + logging.warning("Noun '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' lemma = re.sub(r' nya$', ' dia', splitform.lower()) - upos = 'VERB PRON' + upos = '* PRON' feats = '* Number=Sing|Person=3|PronType=Prs' - deprel = '* obj:agent' if diverb else '* obj' - xpos = re.sub(r'\+', ' ', node.xpos) - # 'main': 0 ... this is the default value (the first node will be the head and inherit children) - return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + return None return None def postprocess_mwt(self, mwt): From 62b8f93d35142e36722c2fc5ed5dca1073357321 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 14:40:31 +0200 Subject: [PATCH 038/871] Warn about unhandled instances of -nya. 
--- udapi/block/ud/id/addmwt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index e22f27bd..40421611 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -59,7 +59,9 @@ def multiword_analysis(self, node): deprel = '* nmod:poss' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} - return None + else: + logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s'" % (node.form, node.upos)) + return None return None def postprocess_mwt(self, mwt): From f5dde932ee8348abd01078123eedf5c46f7cb4dc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 14:56:03 +0200 Subject: [PATCH 039/871] Indonesian verb lemmatization turned on. --- udapi/block/ud/id/fixgsd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 552d6743..458e41db 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -31,7 +31,7 @@ def lemmatize_verb_from_morphind(self, node): else: lemma = morphemes[0] # Remove the stem POS category. - lemma = re.sub(r"<[a-z]+>$", "", lemma) + lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) node.lemma = lemma def merge_reduplicated_plural(self, node): @@ -76,4 +76,4 @@ def fix_plural_propn(self, node): def process_node(self, node): self.fix_plural_propn(node) - #self.lemmatize_verb_from_morphind(node) + self.lemmatize_verb_from_morphind(node) From c351c00c12e75ce7cfb802599e3c02258ecb2fc0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 22:21:54 +0200 Subject: [PATCH 040/871] Why we attach -nya as obl:agent. 
--- udapi/block/ud/id/addmwt.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 40421611..1270ba77 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -38,7 +38,13 @@ def multiword_analysis(self, node): lemma = re.sub(r' nya$', ' dia', splitform.lower()) upos = 'VERB PRON' feats = '* Number=Sing|Person=3|PronType=Prs' - deprel = '* obj:agent' if diverb else '* obj' + # The agent of the passive verb is coded like a direct object of an active verb, + # so we might want to use obj:agent rather than obl:agent. However, full nominals + # as passive agents can be optionally accompanied by the preposition _oleh_ "by", + # which is an argument in favor of saying that they are oblique. So we currently + # mark all passive agents as obliques, although it is disputable in Austronesian + # languages (unlike Indo-European passives). + deprel = '* obl:agent' if diverb else '* obj' xpos = re.sub(r'\+', ' ', node.xpos) # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} From ee04cdea67d1edb64d0f9573103818557204eb81 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 12 Sep 2021 22:52:40 +0200 Subject: [PATCH 041/871] ke-sama-an --- udapi/block/ud/id/fixgsd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 458e41db..481630fa 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -19,7 +19,10 @@ def lemmatize_verb_from_morphind(self, node): # Split morphind to prefix, stem, and suffix. morphemes = re.split(r"\+", morphind) # Expected suffixes are -kan, -i, -an, or no suffix at all. 
- if len(morphemes) > 1 and re.match(r"^(kan|i|an)$", morphemes[-1]): + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. + if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): del morphemes[-1] # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". From 5659d820afe8ff2dfabf1785a4fdd3e94ab58a3a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Sep 2021 16:34:22 +0200 Subject: [PATCH 042/871] New fixes for Indonesian. --- udapi/block/ud/id/addmwt.py | 73 ++++++++++++++++++++++++++++++++++--- udapi/block/ud/id/fixgsd.py | 48 ++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 1270ba77..7e8db2f0 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -52,10 +52,6 @@ def multiword_analysis(self, node): splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. - if node.feats['Number[psor]'] != 'Sing': - logging.warning("Noun '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]'])) - if node.feats['Person[psor]'] != '3': - logging.warning("Noun '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' lemma = re.sub(r' nya$', ' dia', splitform.lower()) @@ -65,8 +61,75 @@ def multiword_analysis(self, node): deprel = '* nmod:poss' # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADJ': + # nominalized adjective + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'ADJ DET' + feats = '* Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE): + # semua = all (DET) + # semuanya = nominalization of semua, i.e., 'everything' (PRON) + # banyak = many, much (DET) + # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON) + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'DET DET' + feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE): + # satu = one (NUM) + # satunya = nominalization of satu, meaning 'the only one' + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'NUM DET' + feats = 'NumType=Card Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADP' and node.xpos == 'R--+PS3' or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): + # Fused preposition and pronoun. + # Most of them are recognized as R--+PS3 by MorphInd. However, some are different: + # bersamanya = 'with him' = VSA+PS3 + # dibawahnya = 'under it' = VSP+PS3 + # didalamnya = 'inside it' = VSP+PS3 + # sekitarnya = 'around it' = D--+PS3 + # However: + # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3) + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + upos = 'ADP PRON' + feats = '_ Number=Sing|Person=3|PronType=Prs' + xpos = 'R-- PS3' + if node.udeprel == 'case': + if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos): + deprel = 'nmod' + else: + deprel = 'obl' + else: + deprel = '*' + deprel = 'case '+deprel + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} else: - logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s'" % (node.form, node.upos)) + # Do not warn about instances that are known exceptions. + # akibatnya = as a result (SCONJ); akibat = result + # bukannya = instead (PART); bukan = no, not + # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) + # layaknya = like (ADP); layak = worthy + # sebaiknya = should (AUX) + # sesampainya = once in / arriving at (ADP) + # tidaknya = whether or not (PART); tidak = no, not + # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. 
+ if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)nya$', node.form, re.IGNORECASE): + logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) return None return None diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 481630fa..fce6e4f9 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -5,6 +5,52 @@ class FixGSD(Block): + def fix_upos_based_on_morphind(self, node): + """ + Example from data: ("kesamaan"), the correct UPOS is NOUN, as + suggested by MorphInd. + Based on my observation so far, if there is a different UPOS between + the original GSD and MorphInd, it's better to trust MorphInd + I found so many incorrect UPOS in GSD, especially when NOUNs become + VERBs and VERBs become NOUNs. + I suggest adding Voice=Pass when the script decides ke-xxx-an as VERB. + """ + if node.upos == 'VERB' and node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE): + node.upos = 'NOUN' + if node.udeprel == 'acl': + node.deprel = 'nmod' + elif node.udeprel == 'advcl': + node.deprel = 'obl' + + def fix_ordinal_numerals(self, node): + """ + Ordinal numerals should be ADJ NumType=Ord in UD. They have many different + UPOS tags in Indonesian GSD. This method harmonizes them. 
+ pertama = first + kedua = second + ketiga = third + keempat = fourth + kelima = fifth + keenam = sixth + ketujuh = seventh + kedelapan = eighth + kesembilan = ninth + ke48 = 48th + """ + # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) + if re.match(r'^(pertama|kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + # The following is not an ordinal numeral but I am too lazy to create a separate method for that. + elif node.form.lower() == 'semua': + # It means 'all'. Originally it was DET, PRON, or ADV. + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + if node.udeprel == 'nmod' or node.udeprel == 'advmod': + node.deprel = 'det' + def lemmatize_verb_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. # The analysis has been interpreted wrongly for some verbs, so we need @@ -79,4 +125,6 @@ def fix_plural_propn(self, node): def process_node(self, node): self.fix_plural_propn(node) + self.fix_upos_based_on_morphind(node) + self.fix_ordinal_numerals(node) self.lemmatize_verb_from_morphind(node) From f611e3e74192cef526d7ef702faf3f6c50a7838b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Sep 2021 17:55:40 +0200 Subject: [PATCH 043/871] Splitting -kah, -lah, -pun, -tah in Indonesian. 
--- udapi/block/ud/id/addmwt.py | 13 +++++++++- udapi/block/ud/id/fixgsd.py | 51 ++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 7e8db2f0..65aed6dc 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -131,11 +131,22 @@ def multiword_analysis(self, node): if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)nya$', node.form, re.IGNORECASE): logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) return None + elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): + splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = '* PART' + feats = '* _' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split()) < 2: + xpos = xpos + ' T--' + deprel = '* advmod:emph' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} return None def postprocess_mwt(self, mwt): """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" - match = re.match(r'^\^(.*)\+(dia
<p>
_PS3)\$$', mwt.misc['MorphInd']) + match = re.match(r'^\^(.*)\+(dia
<p>
_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd']) if match: mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index fce6e4f9..4ea96968 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -57,31 +57,34 @@ def lemmatize_verb_from_morphind(self, node): # to re-interpret it and extract the correct lemma. if node.upos == "VERB": morphind = node.misc["MorphInd"] - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. - morphind = re.sub(r"_VS[AP]$", "", morphind) - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r"\+", morphind) - # Expected suffixes are -kan, -i, -an, or no suffix at all. - # There is also the circumfix ke-...-an which seems to be nominalized adjective: - # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; - # but I am not sure what is the reason that these are tagged VERB. - if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): - del morphemes[-1] - # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. - # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". - while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. 
+ morphind = re.sub(r"_VS[AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. + if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) + node.lemma = lemma else: - lemma = morphemes[0] - # Remove the stem POS category. - lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) - node.lemma = lemma + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) def merge_reduplicated_plural(self, node): # Instead of compound:plur, merge the reduplicated plurals into a single token. From 9dc01a1620237e97d446782714be27343129a942 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Sep 2021 21:47:21 +0200 Subject: [PATCH 044/871] More Indonesian clitics. 
--- udapi/block/ud/id/addmwt.py | 82 ++++++++++++++++++++++++++++--------- udapi/block/ud/id/fixgsd.py | 2 +- 2 files changed, 63 insertions(+), 21 deletions(-) diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 65aed6dc..7f5ab271 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -11,9 +11,30 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" - if re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia
<p>
_PS3\$$', node.misc['MorphInd']): + if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku
<p>
_PS1|kamu
<p>
_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB': + splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE) + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON VERB' + if re.search(r'^ku ', splitform.lower()): + lemma = re.sub(r'^ku ', 'aku ', splitform.lower()) + feats = 'Number=Sing|Person=1|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS1 VSA' + else: + lemma = re.sub(r'^kau ', 'kamu ', splitform.lower()) + feats = 'Number=Sing|Person=2|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS2 VSA' + deprel = 'nsubj *' + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia
<p>
_PS3|aku
<p>
_PS1|kamu
<p>
_PS2)\$$', node.misc['MorphInd']): if node.upos == 'VERB': - splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) # For transitive verbs with the meN- prefix, -nya is an object clitic. # For passive verbs with the di- prefix, -nya refers to a passive agent. # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. @@ -23,10 +44,6 @@ def multiword_analysis(self, node): nominalization = not menverb and not diverb # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. - if node.feats['Number[psor]'] != 'Sing': - logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]'])) - if node.feats['Person[psor]'] != '3': - logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]'])) node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' if nominalization: @@ -35,9 +52,16 @@ def multiword_analysis(self, node): feats = '* Definite=Def|PronType=Art' deprel = '* det' else: - lemma = re.sub(r' nya$', ' dia', splitform.lower()) upos = 'VERB PRON' - feats = '* Number=Sing|Person=3|PronType=Prs' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' # The agent of the passive verb is coded like a direct object of an active verb, # so we might want to use obj:agent rather than obl:agent. 
However, full nominals # as passive agents can be optionally accompanied by the preposition _oleh_ "by", @@ -49,19 +73,26 @@ def multiword_analysis(self, node): # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif re.match(r'(NOUN|PROPN|X)', node.upos): - splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. node.feats['Number[psor]'] = '' node.feats['Person[psor]'] = '' - lemma = re.sub(r' nya$', ' dia', splitform.lower()) upos = '* PRON' - feats = '* Number=Sing|Person=3|PronType=Prs' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' xpos = re.sub(r'\+', ' ', node.xpos) deprel = '* nmod:poss' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} - elif node.upos == 'ADJ': + elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): # nominalized adjective splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() @@ -95,7 +126,7 @@ def multiword_analysis(self, node): deprel = '* det' # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} - elif node.upos == 'ADP' and node.xpos == 'R--+PS3' or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): + elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): # Fused preposition and pronoun. # Most of them are recognized as R--+PS3 by MorphInd. However, some are different: # bersamanya = 'with him' = VSA+PS3 @@ -104,11 +135,20 @@ def multiword_analysis(self, node): # sekitarnya = 'around it' = D--+PS3 # However: # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3) - splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) - lemma = re.sub(r' nya$', ' dia', splitform.lower()) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) upos = 'ADP PRON' - feats = '_ Number=Sing|Person=3|PronType=Prs' - xpos = 'R-- PS3' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + xpos = 'R-- PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + xpos = 'R-- PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = 'R-- PS2' if node.udeprel == 'case': if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos): deprel = 'nmod' @@ -128,8 +168,8 @@ def multiword_analysis(self, node): # sesampainya = once in / arriving at (ADP) # tidaknya = whether or not (PART); tidak = no, not # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. 
- if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)nya$', node.form, re.IGNORECASE): - logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) + if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): + logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) return None elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE) @@ -146,7 +186,9 @@ def multiword_analysis(self, node): def postprocess_mwt(self, mwt): """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" - match = re.match(r'^\^(.*)\+(dia
<p>
_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd']) + match = re.match(r'^\^(.*)\+(aku
<p>
_PS1|kamu
<p>
_PS2|dia
<p>
_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd']) + if not match: + match = re.match(r'^\^(aku
<p>
_PS1|kamu
<p>
_PS2)\+(.*)\$$', mwt.misc['MorphInd']) if match: mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 4ea96968..926bc346 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -62,7 +62,7 @@ def lemmatize_verb_from_morphind(self, node): morphind = re.sub(r"^\^", "", morphind) morphind = re.sub(r"\$$", "", morphind) # Remove the final XPOS tag from morphind. - morphind = re.sub(r"_VS[AP]$", "", morphind) + morphind = re.sub(r"_V[SP][AP]$", "", morphind) # Split morphind to prefix, stem, and suffix. morphemes = re.split(r"\+", morphind) # Expected suffixes are -kan, -i, -an, or no suffix at all. From 1a95f38523b7fa3e30a9c868269f9330a740237a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Sep 2021 23:12:24 +0200 Subject: [PATCH 045/871] More Indonesian fixes. --- udapi/block/ud/id/fixgsd.py | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 926bc346..d1c735a4 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -126,8 +126,51 @@ def fix_plural_propn(self, node): if node.upos == 'PROPN': node.feats['Number'] = '' + def fix_satu_satunya(self, node): + """ + 'satu' = 'one' (NUM) + 'satu-satunya' = 'the only' + """ + root = node.root + if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu': + satu0 = node.parent.parent + satu1 = node.parent + nya = node + dash = None + if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-': + dash = satu1.prev_node + satu0.misc['SpaceAfter'] = 'No' + dash.misc['SpaceAfter'] = 'No' + root.text = root.compute_text() + satu1.deprel = 'compound:redup' + nya.parent = satu0 + # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian. 
+ if node.form == 'nya' and node.parent.form.lower() == 'satu': + satu0 = node.parent + nya = node + if satu0.next_node.form == '-': + dash = satu0.next_node + if dash.next_node.form.lower() == 'satu': + satu1 = dash.next_node + if satu1.ord == node.ord-1: + # Merge satu0 + dash + satu1 into one node. + satu0.form = satu0.form + dash.form + satu1.form + dash.remove() + satu1.remove() + # There should be a multi-word token comprising satu1 + nya. + mwt = nya.multiword_token + if mwt: + mwtmisc = mwt.misc.copy() + mwt.remove() + mwt = root.create_multiword_token([satu0, nya], satu0.form + nya.form, mwtmisc) + satu0.misc['SpaceAfter'] = '' + root.text = root.compute_text() + if node.multiword_token and node.no_space_after: + node.misc['SpaceAfter'] = '' + def process_node(self, node): self.fix_plural_propn(node) self.fix_upos_based_on_morphind(node) self.fix_ordinal_numerals(node) self.lemmatize_verb_from_morphind(node) + self.fix_satu_satunya(node) From bfada9deeb004a60438bb0b9562270f051877d3c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Sep 2021 22:40:57 +0200 Subject: [PATCH 046/871] Distinguishing Indonesian ordinal numerals from total cardinal numerals. --- udapi/block/ud/id/fixgsd.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index d1c735a4..6247cba0 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -36,13 +36,32 @@ def fix_ordinal_numerals(self, node): kedelapan = eighth kesembilan = ninth ke48 = 48th + + However! The ke- forms (i.e., not 'pertama') can also function as total + versions of cardinal numbers ('both', 'all three' etc.). If the numeral + precedes the noun, it is a total cardinal; if it follows the noun, it is + an ordinal. An exception is when the modified noun is 'kali' = 'time'. 
+ Then the numeral is ordinal regardless where it occurs, and together + with 'kali' it functions as an adverbial ordinal ('for the second time'). """ # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) - if re.match(r'^(pertama|kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE): node.upos = 'ADJ' node.feats['NumType'] = 'Ord' if re.match(r'^(det|nummod|nmod)$', node.udeprel): node.deprel = 'amod' + elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if node.parent.ord < node.ord or node.parent.lemma == 'kali': + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + else: + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['PronType'] = 'Tot' + if re.match(r'^(det|amod|nmod)$', node.udeprel): + node.deprel = 'nummod' # The following is not an ordinal numeral but I am too lazy to create a separate method for that. elif node.form.lower() == 'semua': # It means 'all'. Originally it was DET, PRON, or ADV. From 12aa517448a8b5fa79af1e115f41dc8d867acbe6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 20 Sep 2021 10:53:11 +0200 Subject: [PATCH 047/871] Fix tokenization of ordinal numerals in Indonesian. 
--- udapi/block/ud/id/fixgsd.py | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 6247cba0..2296fc31 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -70,6 +70,56 @@ def fix_ordinal_numerals(self, node): if node.udeprel == 'nmod' or node.udeprel == 'advmod': node.deprel = 'det' + def rejoin_ordinal_numerals(self, node): + """ + If an ordinal numeral is spelled using digits ('ke-18'), it is often + tokenized as multiple tokens, which is wrong. Fix it. + """ + if node.form.lower() == 'ke': + dash = None + number = None + if node.next_node: + if node.next_node.form == '-': + dash = node.next_node + if dash.next_node and re.match(r'^\d+$', dash.next_node.form): + number = dash.next_node + node.form = node.form + dash.form + number.form + node.lemma = node.lemma + dash.lemma + number.lemma + elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node): + number = node.next_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = node.form + '-' + number.form + node.form = node.form + number.form + node.lemma = node.lemma + '-' + number.lemma + if number: + # Let us pretend that these forms are always ordinal numerals. + # Situations where they act as total cardinals will be disambiguated + # in a subsequent call to fix_ordinal_numerals(). + node.upos = 'ADJ' + node.xpos = 'CO-' + node.feats['NumType'] = 'Ord' + node.misc['MorphInd'] = '^ke_R--+' + number.form + '_CC-$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'amod' + # Adjust SpaceAfter. + node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. 
+ if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + def lemmatize_verb_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. # The analysis has been interpreted wrongly for some verbs, so we need @@ -190,6 +240,7 @@ def fix_satu_satunya(self, node): def process_node(self, node): self.fix_plural_propn(node) self.fix_upos_based_on_morphind(node) + self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.lemmatize_verb_from_morphind(node) self.fix_satu_satunya(node) From f413c812239863b9e216bf5b258496668494fff0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 20 Sep 2021 13:39:20 +0200 Subject: [PATCH 048/871] "ke48" is a typo (although it occurs in the corpus), the correct spelling is "ke-48". --- udapi/block/ud/id/fixgsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 2296fc31..edc71142 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -35,7 +35,7 @@ def fix_ordinal_numerals(self, node): ketujuh = seventh kedelapan = eighth kesembilan = ninth - ke48 = 48th + ke-48 = 48th However! The ke- forms (i.e., not 'pertama') can also function as total versions of cardinal numbers ('both', 'all three' etc.). 
If the numeral From cda5e7ade81b8d00d921e87cd52f02f6edf2c71e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Sep 2021 17:42:06 +0200 Subject: [PATCH 049/871] after_process_document should print before redirecting fixes #94 --- udapi/block/write/textmodetreeshtml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 75a39a97..9f9f6aa2 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -53,8 +53,8 @@ def before_process_document(self, document): print('%s = %s' % (key, value)) def after_process_document(self, document): - super().after_process_document(document) print("\n\n") + super().after_process_document(document) def add_node(self, idx, node): if not node.is_root(): From 1f61ca4894179d95bea29fcf2baf8c6e44ef48d3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 2 Oct 2021 15:30:51 +0200 Subject: [PATCH 050/871] Merge tokens describing decades in Indonesian. --- udapi/block/ud/id/fixgsd.py | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index edc71142..9634bc2d 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -120,6 +120,57 @@ def rejoin_ordinal_numerals(self, node): # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text() + def rejoin_decades(self, node): + """ + In Indonesian, the equivalent of English "1990s" is written as "1990-an". + In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it. 
+ """ + if node.form.lower() == 'an': + dash = None + number = None + if node.prev_node: + if node.prev_node.form == '-': + dash = node.prev_node + if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form): + number = dash.prev_node + node.form = number.form + dash.form + node.form + node.lemma = number.lemma + dash.lemma + node.lemma + elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node): + number = node.prev_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = number.form + '-' + node.form + node.form = number.form + node.form + node.lemma = number.lemma + '-' + node.lemma + if number: + # The combined token is no longer a numeral. It cannot quantify an entity. + # Instead, it is itself something like a noun (or perhaps proper noun). + node.upos = 'NOUN' + node.xpos = 'NSD' + node.feats['NumType'] = '' + # In some cases, "-an" is labeled as foreign for no obvious reason. + node.feats['Foreign'] = '' + node.misc['MorphInd'] = '^' + number.form + '_CC-+an_F--$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'nmod' + # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex. + #node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + def lemmatize_verb_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. 
# The analysis has been interpreted wrongly for some verbs, so we need @@ -242,5 +293,6 @@ def process_node(self, node): self.fix_upos_based_on_morphind(node) self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) + self.rejoin_decades(node) self.lemmatize_verb_from_morphind(node) self.fix_satu_satunya(node) From e7d7502b79a2170ecad339607d8e6b3f44dc5ba9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 2 Oct 2021 23:32:14 +0200 Subject: [PATCH 051/871] More fixes for Indonesian UD. --- udapi/block/ud/addmwt.py | 5 ++++ udapi/block/ud/id/addmwt.py | 31 ++++++++++++++++++-- udapi/block/ud/id/fixgsd.py | 57 ++++++++++++++++++++++++++++++++++--- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index ffa78bbb..2d251989 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -1,5 +1,6 @@ """Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" from udapi.core.block import Block +import logging class AddMwt(Block): @@ -40,6 +41,10 @@ def process_node(self, node): if attr in analysis: values = analysis[attr].split() for i, new_node in enumerate(nodes): + if len(values) <= i: + logging.warning("Attribute '%s' not supplied for word no. %d" % (attr, i)) + for attr in 'form lemma upos xpos feats deprel misc'.split(): + logging.warning("%s = %s" % (attr, analysis.get(attr, ''))) if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) elif attr == 'feats' and '*' in values[i]: diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 7f5ab271..a8d50748 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -92,13 +92,39 @@ def multiword_analysis(self, node): deprel = '* nmod:poss' # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE): + # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs' + xpos = 'NSD PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs' + xpos = 'NSD PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs' + xpos = 'NSD PS2' + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): # nominalized adjective splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() upos = 'ADJ DET' feats = '* Definite=Def|PronType=Art' - xpos = re.sub(r'\+', ' ', node.xpos) + if re.match(r' ', node.xpos): + xpos = re.sub(r'\+', ' ', node.xpos) + else: + xpos = 'ASP PS3' deprel = '* det' # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} @@ -162,13 +188,12 @@ def multiword_analysis(self, node): # Do not warn about instances that are known exceptions. # akibatnya = as a result (SCONJ); akibat = result # bukannya = instead (PART); bukan = no, not - # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) # layaknya = like (ADP); layak = worthy # sebaiknya = should (AUX) # sesampainya = once in / arriving at (ADP) # tidaknya = whether or not (PART); tidak = no, not # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. - if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): + if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) return None elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 9634bc2d..1e83c6e9 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -171,12 +171,12 @@ def rejoin_decades(self, node): # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text() - def lemmatize_verb_from_morphind(self, node): + def lemmatize_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. 
# The analysis has been interpreted wrongly for some verbs, so we need # to re-interpret it and extract the correct lemma. - if node.upos == "VERB": - morphind = node.misc["MorphInd"] + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': if morphind: # Remove the start and end tags from morphind. morphind = re.sub(r"^\^", "", morphind) @@ -205,6 +205,55 @@ def lemmatize_verb_from_morphind(self, node): node.lemma = lemma else: logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. 
+ if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) def merge_reduplicated_plural(self, node): # Instead of compound:plur, merge the reduplicated plurals into a single token. @@ -294,5 +343,5 @@ def process_node(self, node): self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) - self.lemmatize_verb_from_morphind(node) + self.lemmatize_from_morphind(node) self.fix_satu_satunya(node) From a8c36e50d6a1877a102da2456126f4d5b67b59de Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 6 Oct 2021 00:00:24 +0200 Subject: [PATCH 052/871] prevent errors on a quote surrounded by spaces --- udapi/block/segment/simple.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py index 5f4a8423..58be9b6d 100644 --- a/udapi/block/segment/simple.py +++ b/udapi/block/segment/simple.py @@ -33,8 +33,12 @@ def is_boundary(self, first, second): return False if first[-1] in '"“»›)': first = first[:-1] + if not first: + return False if second[0] in '"„«¿¡‹(': second = second[1:] + if not second: + return False if not second[0].isupper() or second[0].isdigit(): return False if not first[-1] in '.!?': From 3af3eabb17c185bc28d9fc6c4657ef384b384117 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Oct 2021 18:06:52 +0200 Subject: [PATCH 053/871] Merge 
more reduplications in Indonesian. --- udapi/block/ud/id/fixgsd.py | 80 ++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 1e83c6e9..8f7ed20a 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -255,34 +255,65 @@ def lemmatize_from_morphind(self, node): else: logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) - def merge_reduplicated_plural(self, node): - # Instead of compound:plur, merge the reduplicated plurals into a single token. - if node.deprel == "compound:plur": - root = node.root - # We assume that the previous token is a hyphen and the token before it is the parent. - first = node.parent - if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): - hyph = node.prev_node - if hyph.is_descendant_of(first) and re.match(r"^(-|–|--)$", hyph.form): - # Neither the hyphen nor the current node should have children. - # If they do, re-attach the children to the first node. - for c in hyph.children: - c.parent = first - for c in node.children: - c.parent = first - # Merge the three nodes. - first.form = first.form + "-" + node.form - first.feats["Number"] = "Plur" + def merge_reduplication(self, node): + """ + Reduplication is a common morphological device in Indonesian. Reduplicated + nouns signal plural but some reduplications also encode emphasis, modification + of meaning etc. In the previous annotation of GSD, reduplication was mostly + analyzed as three tokens, e.g., for plurals, the second copy would be attached + to the first one as compound:plur, and the hyphen would be attached to the + second copy as punct. We want to analyze reduplication as a single token. + Fix it. + """ + # We assume that the previous token is a hyphen and the token before it is the parent. 
+ first = node.parent + if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + root = node.root + # This is specific to the reduplicated plurals. The rest will be done for any reduplications. + # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. + ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. + ###!!! Some other reduplications have slight modifications on one or the other side. + if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): + first.feats['Number'] = 'Plur' + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = node.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. 
+ if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + first.form = first.form + '-' + node.form + hyph.remove() + node.remove() + first.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([first, second], first.form + second.form, mwtmisc) + else: + first.form = first.form + '-' + node.form if node.no_space_after: - first.misc["SpaceAfter"] = "No" + first.misc['SpaceAfter'] = 'No' else: - first.misc["SpaceAfter"] = "" + first.misc['SpaceAfter'] = '' hyph.remove() node.remove() - # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. - # If it did not, then we have a mismatch with the sentence text, which we must fix. - # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). - root.text = root.compute_text() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() def fix_plural_propn(self, node): """ @@ -343,5 +374,6 @@ def process_node(self, node): self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) + self.merge_reduplication(node) self.lemmatize_from_morphind(node) self.fix_satu_satunya(node) From 78ffaef93cc8255f48685e47ccf0229a6d4c30c9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Oct 2021 22:36:33 +0200 Subject: [PATCH 054/871] More token merging in Indonesian GSD. 
--- udapi/block/ud/id/fixgsd.py | 59 +++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 8f7ed20a..12ca9712 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -267,16 +267,29 @@ def merge_reduplication(self, node): """ # We assume that the previous token is a hyphen and the token before it is the parent. first = node.parent - if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): + root = node.root + # Example of identical reduplication: negara-negara = countries + # Example of reduplication with -an: kopi-kopian = various coffee trees + # Example of reduplication with vowel substitution: bolak-balik = alternating + # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) + # Example of reduplication with se-: sehari-hari = daily (hari = day) + # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. + if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti)$', first.form.lower())): hyph = node.prev_node if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): - root = node.root # This is specific to the reduplicated plurals. The rest will be done for any reduplications. # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. ###!!! Some other reduplications have slight modifications on one or the other side. 
if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): first.feats['Number'] = 'Plur' + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + if re.match(r'^(non|sub|anti)$', first.form.lower()): + first.lemma = first.lemma + '-' + node.lemma + first.upos = node.upos + first.xpos = node.xpos + first.feats = node.feats + first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd']) # Neither the hyphen nor the current node should have children. # If they do, re-attach the children to the first node. for c in hyph.children: @@ -314,6 +327,48 @@ def merge_reduplication(self, node): # If it did not, then we have a mismatch with the sentence text, which we must fix. # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() + # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti)$', node.form.lower()): + prefix = node + stem = first # here it is not the first part at all + hyph = stem.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + stem.lemma = prefix.lemma + '-' + stem.lemma + stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd']) + # Neither the hyphen nor the prefix should have children. + # If they do, re-attach the children to the stem. + for c in hyph.children: + c.parent = stem + for c in prefix.children: + c.parent = stem + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... 
originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = stem.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + stem.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([stem, second], stem.form + second.form, mwtmisc) + else: + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() def fix_plural_propn(self, node): """ From 48834686d73034293c737598d6bc12ed20907503 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Oct 2021 16:31:48 +0200 Subject: [PATCH 055/871] Multi, kontra. --- udapi/block/ud/id/fixgsd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 12ca9712..b5142040 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -274,7 +274,7 @@ def merge_reduplication(self, node): # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) # Example of reduplication with se-: sehari-hari = daily (hari = day) # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. 
- if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti)$', first.form.lower())): + if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())): hyph = node.prev_node if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): # This is specific to the reduplicated plurals. The rest will be done for any reduplications. @@ -284,7 +284,7 @@ def merge_reduplication(self, node): if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): first.feats['Number'] = 'Plur' # For the non-/sub-/anti- prefix we want to take the morphology from the second word. - if re.match(r'^(non|sub|anti)$', first.form.lower()): + if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()): first.lemma = first.lemma + '-' + node.lemma first.upos = node.upos first.xpos = node.xpos @@ -328,7 +328,7 @@ def merge_reduplication(self, node): # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. 
- elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti)$', node.form.lower()): + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra')$', node.form.lower()): prefix = node stem = first # here it is not the first part at all hyph = stem.prev_node From 75ec0ff74ed2d28a09b514a38b2c25d9cbc779f9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Oct 2021 16:33:40 +0200 Subject: [PATCH 056/871] Bug fix. --- udapi/block/ud/id/fixgsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index b5142040..69c785ab 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -328,7 +328,7 @@ def merge_reduplication(self, node): # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. - elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra')$', node.form.lower()): + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()): prefix = node stem = first # here it is not the first part at all hyph = stem.prev_node From ddbb2b0c459c6595da1ca42bd698e07b7fd245fe Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 23 Oct 2021 22:34:13 +0200 Subject: [PATCH 057/871] Context-based tagging of Indonesian "semua". --- udapi/block/ud/id/fixgsd.py | 197 +++++++++++++++++++----------------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 69c785ab..d328212d 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -22,6 +22,25 @@ def fix_upos_based_on_morphind(self, node): elif node.udeprel == 'advcl': node.deprel = 'obl' + def fix_semua(self, node): + """ + Indonesian "semua" means "everything, all". 
+ Originally it was DET, PRON, or ADV. + Ika: I usually only labeled "semua" as DET only if it's followed by a + NOUN/PROPN. If it's followed by DET (including '-nya' as DET) or it's + not followed by any NOUN/DET, I labeled them as PRON. + """ + if node.form.lower() == 'semua': + if re.match(r'^(NOUN|PROPN)$', node.parent.upos) and node.parent.ord > node.ord: + node.upos = 'DET' + if node.udeprel == 'nmod' or node.udeprel == 'advmod': + node.deprel = 'det' + else: + node.upos = 'PRON' + if node.udeprel == 'det' or node.udeprel == 'advmod': + node.deprel = 'nmod' + node.feats['PronType'] = 'Tot' + def fix_ordinal_numerals(self, node): """ Ordinal numerals should be ADJ NumType=Ord in UD. They have many different @@ -62,13 +81,6 @@ def fix_ordinal_numerals(self, node): node.feats['PronType'] = 'Tot' if re.match(r'^(det|amod|nmod)$', node.udeprel): node.deprel = 'nummod' - # The following is not an ordinal numeral but I am too lazy to create a separate method for that. - elif node.form.lower() == 'semua': - # It means 'all'. Originally it was DET, PRON, or ADV. - node.upos = 'DET' - node.feats['PronType'] = 'Tot' - if node.udeprel == 'nmod' or node.udeprel == 'advmod': - node.deprel = 'det' def rejoin_ordinal_numerals(self, node): """ @@ -171,90 +183,6 @@ def rejoin_decades(self, node): # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text() - def lemmatize_from_morphind(self, node): - # The MISC column contains the output of MorphInd for the current word. - # The analysis has been interpreted wrongly for some verbs, so we need - # to re-interpret it and extract the correct lemma. - morphind = node.misc['MorphInd'] - if node.upos == 'VERB': - if morphind: - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. 
- morphind = re.sub(r"_V[SP][AP]$", "", morphind) - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r"\+", morphind) - # Expected suffixes are -kan, -i, -an, or no suffix at all. - # There is also the circumfix ke-...-an which seems to be nominalized adjective: - # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; - # but I am not sure what is the reason that these are tagged VERB. - if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): - del morphemes[-1] - # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. - # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". - while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) - else: - lemma = morphemes[0] - # Remove the stem POS category. - lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) - node.lemma = lemma - else: - logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) - elif node.upos == 'NOUN': - if morphind: - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. - morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) - # Do not proceed if there is an unexpected final XPOS tag. - if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r'\+', morphind) - # Expected prefixes are peN-, per-, ke-, ber-. - # Expected suffix is -an. 
- if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): - del morphemes[-1] - if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) - else: - lemma = morphemes[0] - # Remove the stem POS category. - lemma = re.sub(r'<[a-z]+>', '', lemma) - node.lemma = lemma - elif node.upos == 'ADJ': - if morphind: - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. - morphind = re.sub(r'_ASS$', '', morphind) - # Do not proceed if there is an unexpected final XPOS tag. - if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r'\+', morphind) - # Expected prefix is ter-. - if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) - else: - lemma = morphemes[0] - # Remove the stem POS category. - lemma = re.sub(r'<[a-z]+>', '', lemma) - node.lemma = lemma - else: - logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) - def merge_reduplication(self, node): """ Reduplication is a common morphological device in Indonesian. Reduplicated @@ -423,12 +351,97 @@ def fix_satu_satunya(self, node): if node.multiword_token and node.no_space_after: node.misc['SpaceAfter'] = '' + def lemmatize_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. 
+ # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_V[SP][AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. + if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. 
+ if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. 
+ lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + def process_node(self, node): self.fix_plural_propn(node) self.fix_upos_based_on_morphind(node) + self.fix_semua(node) self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) self.merge_reduplication(node) - self.lemmatize_from_morphind(node) self.fix_satu_satunya(node) + self.lemmatize_from_morphind(node) From fc63be3266eccf252597383baa0687e536b0b3d1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 29 Oct 2021 22:00:20 +0200 Subject: [PATCH 058/871] Added a block to fix tokenization in AnCora (but beware of #95 until it's fixed). --- udapi/block/ud/es/fixexclamation.py | 47 +++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 udapi/block/ud/es/fixexclamation.py diff --git a/udapi/block/ud/es/fixexclamation.py b/udapi/block/ud/es/fixexclamation.py new file mode 100644 index 00000000..7dea8e0d --- /dev/null +++ b/udapi/block/ud/es/fixexclamation.py @@ -0,0 +1,47 @@ +"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixExclamation(Block): + + def process_node(self, node): + """ + In Spanish AnCora, there are things like '¡Hola!' as one token. + The punctuation should be separated. One may question whether this + should include names of companies (Yahoo!) or products (la revista + Hello!) but it should, as company and product names often have + multiple tokens (even multiple full words, not just punctuation) + and these are also separated in UD. + """ + if re.search(r'^[¡!]\w', node.form): + # Separate the punctuation and attach it to the rest. 
+ punct = node.create_child() + punct.shift_before_node(node) + punct.form = node.form[:1] + node.form = node.form[1:] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' + if re.search(r'\w[¡!]$', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_after_node(node) + punct.form = node.form[-1:] + node.form = node.form[:-1] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = node.misc['SpaceAfter'] + node.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' From 49f1f4385e4006895708f4e6f22ea06c3bc3723f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Oct 2021 23:17:19 +0200 Subject: [PATCH 059/871] Fix leaf-aux-cop. --- udapi/block/ud/fixleaf.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 udapi/block/ud/fixleaf.py diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py new file mode 100644 index 00000000..4cac1175 --- /dev/null +++ b/udapi/block/ud/fixleaf.py @@ -0,0 +1,35 @@ +""" +Block ud.FixLeaf checks that function word dependents are leaves. +Certain known exceptions are observed (e.g., fixed expressions). +""" +from udapi.core.block import Block +import logging +import re + +class FixLeaf(Block): + """ + Make sure that aux and cop dependents are leaves unless one of the known + exceptions applies. 
+ """ + + def __init__(self, deprels='aux,cop', **kwargs): + """ + Args: + deprels: comma-separated list of deprels to be fixed. Default = aux,cop. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel: + children = node.children + # Every function dependent can have a fixed child. + # We will also allow conj, cc, punct, goeswith, reparandum. + children = [c for c in children if not re.match(r'^(fixed|conj|cc|punct|goeswith|reparandum)$', c.udeprel)] + # Re-attach the remaining children to an acceptable ancestor. + ancestor = node.parent + while ancestor.udeprel in self.deprels: + ancestor = ancestor.parent + for c in children: + c.parent = ancestor From 0c09c4e666b1cc9923411492f7985ffb00fb88d2 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 31 Oct 2021 09:30:20 +0100 Subject: [PATCH 060/871] FixLeaf must update enhanced relations, too. --- udapi/block/ud/fixleaf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py index 4cac1175..345b68f9 100644 --- a/udapi/block/ud/fixleaf.py +++ b/udapi/block/ud/fixleaf.py @@ -33,3 +33,8 @@ def process_node(self, node): ancestor = ancestor.parent for c in children: c.parent = ancestor + # If there are enhanced dependencies, check whether we want to redirect them too. + if c.deps: + for edep in c.deps: + if edep['parent'] == node: + edep['parent'] = ancestor From 9528d7cf5d4927c64fba305a0ced8b32449fec4a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 3 Nov 2021 14:41:54 +0100 Subject: [PATCH 061/871] FixLeaf cc. --- udapi/block/ud/fixleaf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py index 345b68f9..d715ec01 100644 --- a/udapi/block/ud/fixleaf.py +++ b/udapi/block/ud/fixleaf.py @@ -12,10 +12,10 @@ class FixLeaf(Block): exceptions applies. 
""" - def __init__(self, deprels='aux,cop', **kwargs): + def __init__(self, deprels='aux,cop,cc', **kwargs): """ Args: - deprels: comma-separated list of deprels to be fixed. Default = aux,cop. + deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc. """ super().__init__(**kwargs) self.deprels = deprels.split(',') @@ -23,10 +23,12 @@ def __init__(self, deprels='aux,cop', **kwargs): def process_node(self, node): for deprel in self.deprels: if node.udeprel == deprel: - children = node.children # Every function dependent can have a fixed child. # We will also allow conj, cc, punct, goeswith, reparandum. - children = [c for c in children if not re.match(r'^(fixed|conj|cc|punct|goeswith|reparandum)$', c.udeprel)] + allowed = ['fixed', 'punct', 'goeswith', 'reparandum'] + if deprel != 'cc': + allowed += ['conj', 'cc'] + children = [c for c in node.children if not (c.udeprel in allowed)] # Re-attach the remaining children to an acceptable ancestor. ancestor = node.parent while ancestor.udeprel in self.deprels: From f1028cbc26cd627308fc7f7dd14c20ed9246c114 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 28 Nov 2021 11:32:52 +0100 Subject: [PATCH 062/871] Javanese does not distinguish VerbForms, either. --- udapi/block/ud/markbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index cbd57eef..5ca0f703 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages. 
- if i_feat != "VerbForm" or not node.root.zone.split("_")[0] in {"id", "tl", "hil", "ifb"}: + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb'}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': From d1da0b0a45fe37ce6715acee6a2dfbdb4591b264 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 29 Nov 2021 10:52:45 +0100 Subject: [PATCH 063/871] Added a block to fix German GSD. --- udapi/block/ud/de/fixgsd.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 udapi/block/ud/de/fixgsd.py diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py new file mode 100644 index 00000000..d6853330 --- /dev/null +++ b/udapi/block/ud/de/fixgsd.py @@ -0,0 +1,37 @@ +""" +Block to fix annotation of UD German-GSD. +""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def process_node(self, node): + """ + Normalizes tokenization, lemmatization and tagging of ordinal numerals + that are expressed using digits followed by a period. + https://github.com/UniversalDependencies/UD_German-GSD/issues/24 + """ + # Ignore periods that terminate a sentence, although they could belong + # to an ordinal numeral at the same time. + if node.form == '.' and node.next_node: + # Ignore number+period combinations that have an intervening space. + if node.prev_node and re.match('^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + # Merge the number and the period into one token. + number = node.prev_node + period = node + # The period should not have any children but if it does, re-attach them to the number. + for c in period.children: + c.parent = number + # The period should be followed by a space but if it isn't, mark it at the number. + number.misc['SpaceAfter'] = 'No' if period.no_space_after else '' + number.form += '.' 
+ number.lemma = number.form + number.upos = 'ADJ' + number.xpos = 'ADJA' + number.feats = '_' + number.feats['NumType'] = 'Ord' + if number.udeprel == 'nummod': + number.deprel = 'amod' + period.remove() From fa25c0ed4a25026270c5a7f96287804c7a1e89cd Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 29 Nov 2021 11:44:36 +0100 Subject: [PATCH 064/871] More fixes of ordinals in German. --- udapi/block/ud/de/fixgsd.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py index d6853330..65d12681 100644 --- a/udapi/block/ud/de/fixgsd.py +++ b/udapi/block/ud/de/fixgsd.py @@ -17,7 +17,7 @@ def process_node(self, node): # to an ordinal numeral at the same time. if node.form == '.' and node.next_node: # Ignore number+period combinations that have an intervening space. - if node.prev_node and re.match('^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + if node.prev_node and re.match(r'^\d+$', node.prev_node.form) and node.prev_node.no_space_after: # Merge the number and the period into one token. number = node.prev_node period = node @@ -35,3 +35,24 @@ def process_node(self, node): if number.udeprel == 'nummod': number.deprel = 'amod' period.remove() + # Even if the digits and the period are already in one token, check their annotation. + if re.match(r'^\d+\.$', node.form): + node.lemma = node.form + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats = '_' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' + # Finally, make sure that ordinal numerals expressed verbosely are tagged properly. + # Unlike for digits, do not remove the features for Gender, Number, and Case. + # Skip 'acht' because we cannot reliably distinguish it from the cardinal numeral and from the verb 'achten'. 
+ if re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE): + # Skip 'erst' that is used as an adverb. + if node.lemma != 'erst' or node.upos != 'ADV': + node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma) + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' From d7da77817a3bbda9f1b3d388232cdfcd4a734999 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 29 Nov 2021 16:34:57 +0100 Subject: [PATCH 065/871] Block's process_node iterates over a copy of descendants This prevents an infinite loop in tutorial.AddCommas etc. Partial revert of 44c291b930fa591477c87457a17c0e76e6ee22ea The slowdown is acceptable (about 0.05s per iterating over 700k words). That said, there may be usecases where iterating over _descendants is beneficial, e.g. when deleting nodes, so that we don't iterate over an already deleted node. --- udapi/core/block.py | 6 +++++- udapi/core/document.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index 64b8bcc5..32033cde 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -29,7 +29,11 @@ def process_node(self, _): def process_tree(self, tree): """Process a UD tree""" - for node in tree._descendants: + # tree.descendants is slightly slower than tree._descendants (0.05s per iterating over 700k words), + # but it seems safer to iterate over a copy of the list of nodes. + # If a user calls parent.create_child().shift_before_node(parent) in process_node, + # it may end up in endless cycle (because the same node is processed again - Python for cycle remembers the position). 
+ for node in tree.descendants: self.process_node(node) def process_bundle(self, bundle): diff --git a/udapi/core/document.py b/udapi/core/document.py index f02f831e..8f9ce3ea 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -91,7 +91,9 @@ def nodes(self): """An iterator over all nodes (excluding empty nodes) in the document.""" for bundle in self: for tree in bundle: - for node in tree._descendants: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process.process_tree(). + for node in tree.descendants: yield node @property From 7381264e9aafbd727de03ea25f6e1e862fdd83b9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Dec 2021 23:22:33 +0100 Subject: [PATCH 066/871] update the conversion block to UD_English-GUM v2.9 --- udapi/block/corefud/gum2corefud.py | 116 ++++++++++++++++------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py index 95be6ce0..bcd24968 100644 --- a/udapi/block/corefud/gum2corefud.py +++ b/udapi/block/corefud/gum2corefud.py @@ -8,82 +8,94 @@ class Gum2CorefUD(Block): def process_tree(self, tree): docname = tree.bundle.document.meta['docname'] + '_' - def entity2cluster_id(name): - return docname + name.strip('()').replace(',','').replace('+','') - clusters = tree.bundle.document.coref_clusters unfinished_mentions = defaultdict(list) for node in tree.descendants: - entity = node.misc['Entity'] - if not entity: + misc_entity = node.misc['Entity'] + if not misc_entity: continue - parts = [x for x in re.split('(\([^())]+\)?|[^())]+\))', entity) if x] - for part in parts: - # GUM entity name could be e.g. - # abstract-173 or place-1-Coron,_Palawan or place-77-Sub-Saharan_Africa. - # Note that the wikification part of the name may contain commas and dashes. - # Let's take the whole name as cluster_id, which will be normalized later on. 
- # We just need to remove commas and plus signs which are forbidden in cluster_id - # because they are used as separators in Bridging and SplitAnte, respectively. - # Let's store the type in cluster.cluster_type and Wikification in mention.misc. - name = entity2cluster_id(part) - if part[0] == '(': + # Attribute Entity may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # entities = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + entities = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for entity in entities: + # GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity + # but the closing tag is shortent just to GRP. + opening, closing = (entity[0] == '(', entity[-1] == ')') + entity = entity.strip('()') + if not opening and not closing: + logging.warning(f"Entity {entity} at {node} has no opening nor closing bracket.") + elif not opening and closing: + name = docname + entity + if not unfinished_mentions[name]: + raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.") + else: + mention = unfinished_mentions[name].pop() + mention.span = f'{mention.head.ord}-{node.ord}' + else: + attrs = entity.split('-') + if len(attrs) == 6: + etype, grp, infstat, minspan, ctype, wiki = attrs + elif len(attrs) == 5: + wiki = None + etype, grp, infstat, minspan, ctype = attrs + elif len(attrs) > 6: + logging.warning(f"Entity {entity} at {node} has more than 6 attributes.") + etype, grp, infstat, minspan, ctype, wiki = entity.split('-', maxsplit=5) + else: + raise ValueError(f"Less than 5 attributes in {entity} at {node}") + name = docname + grp cluster = clusters.get(name) if cluster is None: - chunks = part.strip('()').split('-', maxsplit=2) - if len(chunks) == 3: - ctype, _, wiki = chunks - elif len(chunks) == 2: 
- ctype, _, wiki = chunks[0], None, None - else: - raise ValueError(f"Unexpected entity {part} at {node}") - cluster = node.create_coref_cluster(cluster_id=name, cluster_type=ctype) + cluster = node.create_coref_cluster(cluster_id=name, cluster_type=etype) mention = cluster.mentions[0] + mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" if wiki: - mention.misc = 'Wikification:' + wiki.replace(',', '%2C') + mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') else: mention = cluster.create_mention(head=node) - if part[-1] == ')': + if closing: mention.words = [node] else: unfinished_mentions[name].append(mention) - elif part[-1] == ')': - if not unfinished_mentions[name]: - logging.warning(f"Mention {name} closed at {node}, but not opened in the same tree.") - else: - mention = unfinished_mentions[name].pop() - mention.span = f'{mention.head.ord}-{node.ord}' del node.misc['Entity'] - misc_bridge = node.misc['Bridge'] - if misc_bridge: - # E.g. Entity=event-23|Bridge=time-23 Date: Mon, 6 Dec 2021 18:23:42 +0100 Subject: [PATCH 067/871] Block to fix UD validation of CorefUD 0.2 - this block must be run to fix the trees in CorefUD so they pass the current UD validator - so far it fixes the following issues: - the node with 0 parent must have DEPREL=root - there must be a space before newdoc or newpar --- udapi/block/corefud/fixtovalidate.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 udapi/block/corefud/fixtovalidate.py diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py new file mode 100644 index 00000000..8207835e --- /dev/null +++ b/udapi/block/corefud/fixtovalidate.py @@ -0,0 +1,24 @@ +from udapi.core.block import Block + +class FixToValidate(Block): + """This block fixes the CorefUD data so that the final documents are valid conllu files.""" + + def _set_root_deprel(self, node): + if node.parent == node.root and node.deprel != "root": + node.deprel = "root" + 
print(node) + + def _space_before_pardoc(self, doc): + last_node = None + for i, tree in enumerate(doc.trees): + if i > 0: + if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: + del last_node.misc["SpaceAfter"] + print(tree) + last_node = tree.descendants[-1] + + def process_node(self, node): + self._set_root_deprel(node) + + def process_document(self, doc): + self._space_before_pardoc(doc) From 1b437618b1a5a40de1da261d8ed56df6a0097d86 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 7 Dec 2021 13:53:41 +0100 Subject: [PATCH 068/871] forgotten commit: no debug prints, _set_root_deprel must be called from process_document, otherwise it's not called at all --- udapi/block/corefud/fixtovalidate.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py index 8207835e..3af37490 100644 --- a/udapi/block/corefud/fixtovalidate.py +++ b/udapi/block/corefud/fixtovalidate.py @@ -3,10 +3,10 @@ class FixToValidate(Block): """This block fixes the CorefUD data so that the final documents are valid conllu files.""" - def _set_root_deprel(self, node): - if node.parent == node.root and node.deprel != "root": - node.deprel = "root" - print(node) + def _set_root_deprel(self, doc): + for node in doc.nodes: + if node.parent == node.root and node.deprel != "root": + node.deprel = "root" def _space_before_pardoc(self, doc): last_node = None @@ -14,11 +14,8 @@ def _space_before_pardoc(self, doc): if i > 0: if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: del last_node.misc["SpaceAfter"] - print(tree) last_node = tree.descendants[-1] - def process_node(self, node): - self._set_root_deprel(node) - def process_document(self, doc): + self._set_root_deprel(doc) self._space_before_pardoc(doc) From 2f9c7e480d19b6d0e23013cbbb88b218a5aed154 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 7 Dec 2021 18:43:11 +0100 
Subject: [PATCH 069/871] good point by @martinpopel: iterating over root's children is more efficient --- udapi/block/corefud/fixtovalidate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py index 3af37490..87421688 100644 --- a/udapi/block/corefud/fixtovalidate.py +++ b/udapi/block/corefud/fixtovalidate.py @@ -4,9 +4,10 @@ class FixToValidate(Block): """This block fixes the CorefUD data so that the final documents are valid conllu files.""" def _set_root_deprel(self, doc): - for node in doc.nodes: - if node.parent == node.root and node.deprel != "root": - node.deprel = "root" + for root in doc.trees: + for node in root.children: + if node.deprel != "root": + node.deprel = "root" def _space_before_pardoc(self, doc): last_node = None From c404eb98e7276fce10fa1c0d13569e34e69df5d4 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 9 Dec 2021 18:35:27 +0100 Subject: [PATCH 070/871] fixing the root-is-not-0 UD validation errors for some of the automatically parsed datasets --- udapi/block/corefud/fixtovalidate.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py index 87421688..48a3608d 100644 --- a/udapi/block/corefud/fixtovalidate.py +++ b/udapi/block/corefud/fixtovalidate.py @@ -9,6 +9,22 @@ def _set_root_deprel(self, doc): if node.deprel != "root": node.deprel = "root" + def _unset_root_deprel(self, doc): + for node in doc.nodes: + parent = node.parent + if node.deprel == "root" and parent is not None and not parent.is_root(): + #print("\t".join(['Non-0-root:', node.address(), node.upos, str(node.feats), node.parent.upos, str(node.parent.feats)])) + if parent.upos == "PUNCT" and parent.parent is not None: + node.parent = parent.parent + if node.upos == "CCONJ": + node.deprel = "cc" + elif node.upos == "ADJ" and parent.upos == "PROPN": + node.deprel = "amod" + elif 
node.upos == "NOUN" and parent.upos == "VERB": + node.deprel = "obl" + else: + node.deprel = "parataxis" + def _space_before_pardoc(self, doc): last_node = None for i, tree in enumerate(doc.trees): @@ -19,4 +35,5 @@ def _space_before_pardoc(self, doc): def process_document(self, doc): self._set_root_deprel(doc) + self._unset_root_deprel(doc) self._space_before_pardoc(doc) From 46022c6509eebfdb71196fc9f56bfe47a2197740 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Dec 2021 23:40:48 +0100 Subject: [PATCH 071/871] Heuristics to fix wrong edeprels in Czech. --- udapi/block/ud/cs/fixedeprels.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 udapi/block/ud/cs/fixedeprels.py diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py new file mode 100644 index 00000000..d4455235 --- /dev/null +++ b/udapi/block/ud/cs/fixedeprels.py @@ -0,0 +1,21 @@ +"""Block to fix case-enhanced dependency relations in Czech.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + if edep['deprel'] eq 'nmod:na': + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] = 'nmod:na:acc' + else + edep['deprel'] = 'nmod:na:loc' From 0d409b6cf86a3cf5881a9dfb37e0f39fa3409cb5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Dec 2021 23:42:21 +0100 Subject: [PATCH 072/871] Bug fix. 
--- udapi/block/ud/cs/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index d4455235..9888e51f 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -12,7 +12,7 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - if edep['deprel'] eq 'nmod:na': + if edep['deprel'] == 'nmod:na': # The case is unknown. We need 'acc' or 'loc'. # The locative is probably more frequent but it is not so likely with every noun. if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): From 71d8a216ba88d3af9fb05c2fe5abd224d561776d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Dec 2021 23:43:24 +0100 Subject: [PATCH 073/871] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 9888e51f..5c4be62e 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -17,5 +17,5 @@ def process_node(self, node): # The locative is probably more frequent but it is not so likely with every noun. if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): edep['deprel'] = 'nmod:na:acc' - else + else: edep['deprel'] = 'nmod:na:loc' From 035673ea78eb5c2ec8e6fa92c1a3b8e32ca9247d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 24 Dec 2021 14:57:46 +0100 Subject: [PATCH 074/871] Fix Czech edeprels. 
--- udapi/block/ud/cs/fixedeprels.py | 372 ++++++++++++++++++++++++++++++- 1 file changed, 366 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 5c4be62e..ddcdb6d3 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -12,10 +12,370 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - if edep['deprel'] == 'nmod:na': - # The case is unknown. We need 'acc' or 'loc'. - # The locative is probably more frequent but it is not so likely with every noun. - if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): - edep['deprel'] = 'nmod:na:acc' + if re.match(r'^(acl|advcl):', edep['deprel']): + edep['deprel'] = re.sub(r'^(advcl):a_jestliže$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):a_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jen_když$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jen_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jestliže_tedy$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):když_už$', r'\1:když', edep['deprel']) + edep['deprel'] = 
re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^(advcl):například_když$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):pokud_totiž$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):pokud_však$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):protože_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) + elif re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. 
+ edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. + edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): + edep['deprel'] += ':gen' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. 
+ edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' + elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. + for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + # If there is an nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) 
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. + edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + ###!!! Taky bychom se mohli dívat do XPOS předložky, protože tam bude pád uveden! 
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. 
+ edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. + edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. 
+ edep['deprel'] += ':acc' else: - edep['deprel'] = 'nmod:na:loc' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) + 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:nom$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) 
# 'nejdou mezi, ale uvnitř odvětví a oborů' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:nom$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):po:nom$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:nom$', r'\1:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s:nom$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu_s(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_s(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen)?$', r'\1:v_zájmu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # 
colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) From a0c56a1307426c46bc5974cca4e3f1341da5232c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 26 Dec 2021 22:17:12 +0100 Subject: [PATCH 075/871] Czech enhanced case markers. 
--- udapi/block/ud/cs/fixedeprels.py | 90 ++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index ddcdb6d3..2628f369 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -13,29 +13,26 @@ def process_node(self, node): """ for edep in node.deps: if re.match(r'^(acl|advcl):', edep['deprel']): - edep['deprel'] = re.sub(r'^(advcl):a_jestliže$', r'\1:jestliže', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):a_pokud$', r'\1:pokud', edep['deprel']) + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). + edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|protože|teprve|zejména)_(aby|až|jestliže|když|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|pokud)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jen_když$', r'\1:když', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jen_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) 
- edep['deprel'] = re.sub(r'^(advcl):jestliže_tedy$', r'\1:jestliže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):když_už$', r'\1:když', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^(advcl):například_když$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):pokud_totiž$', r'\1:pokud', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):pokud_však$', r'\1:pokud', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):protože_pokud$', r'\1:pokud', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):v_duch$', r'obl:v_duchu:gen', edep['deprel']) @@ -89,6 +86,8 @@ def process_node(self, node): node.deprel = 'acl:relcl' elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): + edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): @@ -148,6 +147,16 @@ def process_node(self, node): edep['deprel'] += ':loc' elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + 
# Annotation error. + if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): @@ -202,6 +211,18 @@ def process_node(self, node): edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. + cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + logging.info('I am here.') + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): # ':nom' occurs in 'karneval v Rio de Janeiro' edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) @@ -235,8 +256,11 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) @@ -246,7 +270,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:nom$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) @@ -258,6 +282,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako 
by' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) @@ -269,6 +294,8 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) @@ -281,26 +308,29 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:nom$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:nom$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:nom$', r'\1:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s:nom$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) @@ -308,6 +338,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) @@ -315,18 +346,22 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu_s(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) @@ -342,6 +377,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) @@ -353,7 +389,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen)?$', r'\1:v_zájmu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) @@ -367,11 +403,13 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) @@ -379,3 +417,21 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. 
+ ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From a9720d16cb283eec919a4cdfa672085268665cb6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 27 Dec 2021 15:52:23 +0100 Subject: [PATCH 076/871] Czech enhanced case markers. --- udapi/block/ud/cs/fixedeprels.py | 50 +++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 2628f369..5a2e996d 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -14,28 +14,37 @@ def process_node(self, node): for edep in node.deps: if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
- edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|protože|teprve|zejména)_(aby|až|jestliže|když|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|pokud)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) edep['deprel'] = 
re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': @@ -257,9 +266,12 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) @@ -272,6 +284,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France @@ -286,19 +299,25 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', 
r'\1:na_závěr:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) @@ -312,14 +331,17 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:nom$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) @@ -327,6 +349,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! @@ -348,6 +371,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) @@ -359,7 +383,11 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', 
r'\1:ve_jménu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', r'\1:v_kontextu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) @@ -376,6 +404,8 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) @@ -385,7 +415,8 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_s(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) @@ -393,6 +424,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) @@ -401,11 +433,13 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) From ba230a6b92cea415606a6f6acb59fc0f2793100e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 3 Jan 2022 15:15:53 +0100 Subject: [PATCH 077/871] adding `node.siblings` Originally, we decided it is not worth of introducing and bloating the API, but now I saw a usecase for `node.siblings(preceding_only=True)` (used in a list comprehension, and I think nested list comprehensions are evil). --- udapi/core/node.py | 19 +++++++++++++++++-- udapi/core/tests/test_node.py | 2 ++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 5225724e..3d120a52 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -357,10 +357,25 @@ def children(self): nodes2 = [n for n in node.children if n.ord > node.ord] nodes3 = [n for n in node.children if n.ord < node.ord] nodes4 = [n for n in node.children if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. 
""" return ListOfNodes(self._children, origin=self) + @property + def siblings(self): + """Return a list of dependency sibling nodes. + + When used as a property, `node.siblings` is just a shortcut for: + [n for n in node.parent.children if n!=node] + However, it is especially helpful when used as a method, + so e.g. `node.siblings(preceding_only=True)` stands for + [n for n in node.parent.children if n.ord < node.ord] + which is something else than + node.parent.children(preceding_only=True). + See the documentation of ListOfNodes for details. + """ + return ListOfNodes([n for n in self._parent._children if n!=self], origin=self) + @property def descendants(self): """Return a list of all descendants of the current node. @@ -380,7 +395,7 @@ def descendants(self): nodes2 = [n for n in node.descendants if n.ord > node.ord] nodes3 = [n for n in node.descendants if n.ord < node.ord] nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. 
""" # The following code is equivalent to # ListOfNodes(sorted(self.unordered_descendants()), origin=self) diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index f38ca585..28a45d85 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -36,6 +36,8 @@ def test_topology(self): self.assertEqual(len(nodes[1].children), 3) self.assertEqual(len(nodes[1].children(add_self=True)), 4) self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3) + self.assertEqual(nodes[2].siblings, [nodes[0], nodes[3]]) + self.assertEqual(nodes[2].siblings(following_only=True), [nodes[3]]) self.assertEqual(nodes[0].next_node, nodes[1]) self.assertEqual(nodes[2].prev_node, nodes[1]) From d69299ecc6ea0cc8be53fb4ef240665c527caf61 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 5 Jan 2022 18:45:51 +0100 Subject: [PATCH 078/871] fix ZeroDivisionError --- udapi/block/corefud/stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index f07c2a27..e39195db 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -76,7 +76,7 @@ def process_end(self): columns += [('clusters', f"{self.clusters:7,}"), ('clusters_per1k', f"{1000 * self.clusters / total_nodes_nonzero:6.0f}"), ('longest_cluster', f"{self.longest_cluster:6}"), - ('avg_cluster', f"{self.counter['c_total_len'] / self.clusters:5.1f}")] + ('avg_cluster', f"{self.counter['c_total_len'] / clusters_nonzero:5.1f}")] for i in range(1, self.c_len_max + 1): percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) @@ -84,7 +84,7 @@ def process_end(self): columns += [('mentions', f"{self.mentions:7,}"), ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), ('longest_mention', f"{self.longest_mention:6}"), - ('avg_mention', f"{self.counter['m_total_len'] / 
self.mentions:5.1f}")] + ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")] for i in range(0, self.m_len_max + 1): percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) From 3d6a267123de3e505cee6a6821ffd028d8725615 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 5 Jan 2022 19:00:07 +0100 Subject: [PATCH 079/871] read.OldCorefUD and write.CorefUD for the old CorefUD 0.1 format --- udapi/block/read/oldcorefud.py | 81 +++++++++++++++++++++++++++++++++ udapi/block/write/oldcorefud.py | 58 +++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 udapi/block/read/oldcorefud.py create mode 100644 udapi/block/write/oldcorefud.py diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py new file mode 100644 index 00000000..a7bc3101 --- /dev/null +++ b/udapi/block/read/oldcorefud.py @@ -0,0 +1,81 @@ +"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.read.conllu +from udapi.core.coref import CorefCluster, CorefMention, BridgingLinks + +class OldCorefUD(udapi.block.read.conllu.Conllu): + + def process_document(self, doc, strict=True): + super().process_document(doc) + + clusters = {} + for node in doc.nodes_and_empty: + index, index_str = 0, "" + cluster_id = node.misc["ClusterId"] + if not cluster_id: + index, index_str = 1, "[1]" + cluster_id = node.misc["ClusterId[1]"] + while cluster_id: + cluster = clusters.get(cluster_id) + if cluster is None: + cluster = CorefCluster(cluster_id) + clusters[cluster_id] = cluster + mention = CorefMention(node, cluster) + if node.misc["MentionSpan" + index_str]: + mention.span = node.misc["MentionSpan" + index_str] + else: + mention.words = [node] + cluster_type = node.misc["ClusterType" + index_str] + if cluster_type is not None: + if cluster.cluster_type is not None and cluster_type != 
cluster.cluster_type: + logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") + cluster.cluster_type = cluster_type + + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict) + + split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. + # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. + for ante_str in split_ante_str.replace('+', ',').split(','): + if ante_str in clusters: + if ante_str == cluster_id: + _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) + split_antes.append(clusters[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + ante_cl = CorefCluster(ante_str) + clusters[ante_str] = ante_cl + split_antes.append(ante_cl) + cluster.split_ante = sorted(split_antes) + + mention.misc = node.misc["MentionMisc" + index_str] + index += 1 + index_str = f"[{index}]" + cluster_id = node.misc["ClusterId" + index_str] + # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), + # not by the keys (cluster_id). + # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order. + for cluster in clusters.values(): + if not cluster._mentions: + _error(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + cluster._mentions.sort() + doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())} + + # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). 
+        attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split()
+        for node in doc.nodes_and_empty:
+            for key in list(node.misc):
+                if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs):
+                    del node.misc[key]
+
+
+def _error(msg, strict):
+    if strict:
+        raise ValueError(msg)
+    logging.error(msg)
diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py
new file mode 100644
index 00000000..c6c38cbe
--- /dev/null
+++ b/udapi/block/write/oldcorefud.py
@@ -0,0 +1,58 @@
+"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation."""
+import re
+import logging
+import udapi.block.write.conllu
+
+class OldCorefUD(udapi.block.write.conllu.Conllu):
+
+    def process_document(self, doc):
+        if not doc._coref_clusters:
+            logging.warning("Using write.OldCorefUD on a document without any coreference annotation")
+            doc._coref_clusters = {}
+
+        # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC.
+        attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split()
+        for node in doc.nodes_and_empty:
+            for key in list(node.misc):
+                if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs):
+                    del node.misc[key]
+
+        # doc._coref_clusters is a dict, which is insertion ordered in Python 3.7+.
+        # The insertion order is sorted according to CorefCluster.__lt__ (see a few lines above).
+        # However, new clusters could be added meanwhile or some clusters edited,
+        # so we need to sort the clusters again before storing to MISC.
+        # We also need to make sure cluster.mentions are sorted in each cluster
+        # because the ordering of clusters is defined by the first mention in each cluster.
+        # Ordering of mentions within a cluster can be changed when e.g. changing the span
+        # of a given mention or reordering words within a sentence and in such events
+        # Udapi currently does not automatically update the ordering of clusters.
+ for cluster in doc._coref_clusters.values(): + cluster._mentions.sort() + for cluster in sorted(doc._coref_clusters.values()): + for mention in cluster.mentions: + head = mention.head + if head.misc["ClusterId"]: + for a in attrs: + if head.misc[a]: + head.misc[a + "[1]"] = head.misc[a] + del head.misc[a] + index_str = "[2]" + else: + index, index_str = 1, "[1]" + while(head.misc["ClusterId" + index_str]): + index += 1 + index_str = f"[{index}]" + if index == 1: + index_str = "" + head.misc["ClusterId" + index_str] = cluster.cluster_id + head.misc["MentionSpan" + index_str] = mention.span + head.misc["ClusterType" + index_str] = cluster.cluster_type + if mention._bridging: + head.misc["Bridging" + index_str] = str(mention.bridging) + if cluster.split_ante: + serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) + head.misc["SplitAnte" + index_str] = serialized + if mention.misc: + head.misc["MentionMisc" + index_str] = mention.misc + + super().process_document(doc) From d37dd68feda69ba5d1b52de5e146bb974ee4be95 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Jan 2022 22:17:51 +0100 Subject: [PATCH 080/871] util.Eval coref_cluster='...' coref_mention='...' We could use `util.Eval doc='for c in doc.coref_clusters.values():...'`, but it was difficult to fit a bit more difficult code into such oneline. So I've added these two new parameters and now we can write e.g. 
udapy \ util.Eval coref_cluster='print($.cluster_id)' \ coref_mention='print(" ".join(w.form for w in $.words)) --- udapi/block/util/eval.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index b814b80d..07eab681 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,6 +29,7 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, + coref_mention=None, coref_cluster=None, expand_code=True, **kwargs): super().__init__(**kwargs) self.doc = doc @@ -41,6 +42,8 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.after_doc = after_doc self.before_bundle = before_bundle self.after_bundle = after_bundle + self.coref_mention = coref_mention + self.coref_cluster = coref_cluster self.expand_code = expand_code self.count = collections.Counter() @@ -71,6 +74,16 @@ def process_document(self, document): # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) + if self.coref_cluster or self.coref_mention: + for cluster in doc.coref_clusters.values(): + if self.coref_cluster: + this = cluster + exec(self.expand_eval_code(self.coref_cluster)) + if self.coref_mention: + for mention in cluster.mentions: + this = mention + exec(self.expand_eval_code(self.coref_mention)) + def process_bundle(self, bundle): # Extract variables, so they can be used in eval code document = doc = bundle.document From f7f665e6e6868481f4f06d77b08072a134368e30 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 13:50:47 +0100 Subject: [PATCH 081/871] Same block as for Czech: fix case-enhanced deprels in Slovak. 
"""Block to fix case-enhanced dependency relations in Slovak."""
from udapi.core.block import Block
import logging
import re

class FixEdeprels(Block):
    """Normalize case-enhanced edeprels (e.g. ``obl:do:gen``) in Slovak data.

    Enhanced deprels automatically derived from the basic trees often carry a
    wrong or missing preposition/case suffix (secondary prepositions keep the
    lemma of their original part of speech, abbreviations have unknown case,
    some labels are plain annotation errors). This block rewrites such labels
    to a canonical ``relation:preposition:case`` form.
    """

    # NOTE(review): the original commit also contained stray `edep[...] = re.sub(...)`
    # statements directly at class-body level; they referenced the undefined name
    # `edep` (NameError at class definition time) and were removed by the follow-up
    # commit "Removed spurious code". They are dropped here as well.

    # Secondary prepositions sometimes have the lemma of the original part of
    # speech. We want the grammaticalized form instead. List even those that
    # will have the same lexical form, as we also want to check the morphological
    # case. And include all other prepositions that have unambiguous morphological
    # case, even if they are not secondary.
    unambiguous = {
        'do': 'do:gen',
        'k': 'k:dat',
        'mimo': 'mimo:gen',
        'na_rozdiel_od': 'na_rozdiel_od:gen',
        'na_základ': 'na_základe:gen',
        'pomoc': 'pomocou:gen',
        'pre': 'pre:acc',
        'prostredníctvom': 'prostredníctvom:gen',
        's': 's:ins',
        's_dôraz_na': 's_dôrazom_na:acc',
        's_ohľad_na': 's_ohľadom_na:acc',
        's_pomoc': 's_pomocou:gen',
        'smer_k': 'smerom_k:dat',
        'spoločne_s': 'spoločne_s:ins',
        'spolu_s': 'spolu_s:ins',
        'v_dôsledok': 'v_dôsledku:gen',
        'v_meno': 'v_mene:gen',
        'v_oblasť': 'v_oblasti:gen',
        'v_porovnanie_s': 'v_porovnaniu_s:ins',
        'v_priebeh': 'v_priebehu:gen',
        'v_prípad': 'v_prípade:gen',
        'v_prospech': 'v_prospech:gen',
        'v_rámec': 'v_rámci:gen',
        'v_spolupráca_s': 'v_spolupráci_s:ins',
        'v_súlad_s': 'v_súlade_s:ins',
        'v_súvislosť_s': 'v_súvislosti_s:ins',
        'v_ústrety': 'v_ústrety:dat',
        'v_vzťah_k': 'vo_vzťahu_k:dat',
        'v_závislosť_na': 'v_závislosti_na:loc',
        'vzhľad_na': 'vzhľadom_na:acc',
        'z': 'z:gen',
        'z_hľadisko': 'z_hľadiska:gen',
        'začiatkom': 'začiatkom:gen'
    }

    def process_node(self, node):
        """
        Occasionally the edeprels automatically derived from the Slovak basic
        trees do not match the whitelist. For example, the noun is an
        abbreviation and its morphological case is unknown.
        """
        for edep in node.deps:
            # BUGFIX: iterate the class attribute's items — the original
            # `for x, xnorm in unambiguous:` raised NameError (unqualified
            # class attribute) and would unpack dict keys, not (key, value).
            for x, xnorm in self.unambiguous.items():
                # All secondary prepositions have only one fixed morphological case
                # they appear with, so we can replace whatever case we encounter with the correct one.
                m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel'])
                if m:
                    # BUGFIX: use group(1) (the bare base relation), not group(0)
                    # (the whole match), which would duplicate the old case suffix.
                    edep['deprel'] = m.group(1)+':'+xnorm
                    break
            if re.match(r'^(acl|advcl):', edep['deprel']):
                # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations).
                edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel'])  # these instances in FicTree should be spelled 'jako by'
                edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel'])
                edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel'])
                edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel'])  # 'v poslední době se množí bysem místo bych'
                edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel'])  # 'střídmost na způsob Masarykova "jez dopolosyta"'
                edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel'])  # 'neboť' is coordinating
                edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel'])
                edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel'])
                edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel'])
                edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel'])
                edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel'])
                edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel'])
                if edep['deprel'] == 'acl:v' and node.form == 'patře':
                    # Special case: the word form 'patře' misanalyzed as a verb;
                    # retag it as the noun 'patro' and relabel the relation.
                    edep['deprel'] = 'nmod:v:loc'
                    node.deprel = 'nmod'
                    node.lemma = 'patro'
                    node.upos = 'NOUN'
                    node.xpos = 'NNNS6-----A----'
                    node.feats['Aspect'] = ''
                    node.feats['Gender'] = 'Neut'
                    node.feats['Tense'] = ''
                    node.feats['VerbForm'] = ''
                    node.feats['Voice'] = ''
                edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel'])
            elif re.match(r'^(nmod|obl):', edep['deprel']):
                if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
                    # This is a same-case noun-noun modifier, which just happens to be in the locative.
                    # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has
                    # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant.
                    edep['deprel'] = 'nmod'
                elif edep['deprel'] == 'obl:loc':
                    # Annotation error. The first occurrence in PDT dev:
                    # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...'
                    # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'.
                    # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'.
                    edep['deprel'] = 'obl:v:loc'
                elif edep['deprel'] == 'obl:arg:loc':
                    # Annotation error. The first occurrence in PDT dev:
                    edep['deprel'] = 'obl:arg:na:loc'
                elif edep['deprel'] == 'nmod:loc':
                    # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa.
                    edep['deprel'] = 'nmod:nom'
                elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc':
                    # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object?
                    # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now.
                    edep['deprel'] = 'obl'
                elif edep['deprel'] == 'nmod:voc':
                    # 'v 8. čísle tiskoviny Ty rudá krávo'
                    edep['deprel'] = 'nmod:nom'
                elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif edep['deprel'] == 'nmod:co:nom':
                    # Annotation error: 'kompatibilní znamená tolik co slučitelný'
                    # 'co' should be relative pronoun rather than subordinating conjunction.
                    edep['deprel'] = 'acl:relcl'
                    node.deprel = 'acl:relcl'
                elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']):
                    edep['deprel'] += ':dat'
                elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']):
                    edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat'
                elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(obl(:arg)?):li$', edep['deprel']):
                    edep['deprel'] = 'advcl:li'
                elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']):
                    if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':ins'
                elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']):
                    edep['deprel'] += ':acc'
                elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^obl:místo_za:acc$', edep['deprel']):
                    # 'chytají krávu místo za rohy spíše za ocas'
                    # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution.
                    for c in node.children:
                        if c.form == 'místo':
                            c.upos = 'ADV'
                            c.deprel = 'cc'
                    edep['deprel'] = 'obl:za:acc'
                elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']):
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel'])
                elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']):
                    # The case is unknown. We need 'acc' or 'loc'.
                    # The locative is probably more frequent but it is not so likely with every noun.
                    # If there is an nummod:gov child, it must be accusative and not locative.
                    # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.)
                    if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma):
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':loc'
                elif re.match(r'^obl:arg:na_konec$', edep['deprel']):
                    # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku'
                    edep['deprel'] = 'obl:arg:na:acc'
                elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']):
                    if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':ins'
                elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']):
                    edep['deprel'] += ':dat'
                elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']):
                    if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':loc'
                elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']):
                    # Annotation error.
                    if node.form == 's':
                        ohled = node.next_node
                        na = ohled.next_node
                        noun = na.next_node
                        self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc')
                        self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed')
                        self.set_basic_and_enhanced(na, node, 'fixed', 'fixed')
                        self.set_basic_and_enhanced(node, noun, 'case', 'case')
                elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']):
                    edep['deprel'] += ':dat'
                elif re.match(r'^nmod:pára:nom$', edep['deprel']):
                    # Annotation error: 'par excellence'.
                    edep['deprel'] = 'nmod'
                    for c in node.children:
                        if c.udeprel == 'case' and c.form.lower() == 'par':
                            c.lemma = 'par'
                            c.upos = 'ADP'
                            c.xpos = 'RR--X----------'
                            c.feats['Case'] = ''
                            c.feats['Gender'] = ''
                            c.feats['Number'] = ''
                            c.feats['Polarity'] = ''
                            c.feats['AdpType'] = 'Prep'
                elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']):
                    ###!!! We could also look at the XPOS of the preposition, because the case will be listed there!
                    if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':loc'
                elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']):
                    if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':ins'
                elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']):
                    edep['deprel'] += ':acc'
                elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']):
                    edep['deprel'] += ':dat'
                elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']):
                    # Accusative would be possible but unlikely.
                    edep['deprel'] += ':ins'
                elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']):
                    edep['deprel'] += ':acc'
                elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']):
                    edep['deprel'] += ':loc'
                elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']):
                    # Genitive would be possible but unlikely.
                    edep['deprel'] += ':ins'
                elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']):
                    edep['deprel'] += ':acc'
                elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci':
                    # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition.
                    # Find the content nominal.
                    cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)]
                    vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v']
                    if len(cnouns) > 0 and len(vs) > 0:
                        logging.info('I am here.')
                        cnoun = cnouns[0]
                        v = vs[0]
                        self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins')
                        self.set_basic_and_enhanced(v, cnoun, 'case', 'case')
                        self.set_basic_and_enhanced(node, v, 'fixed', 'fixed')
                elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']):
                    # ':nom' occurs in 'karneval v Rio de Janeiro'
                    edep['deprel'] = re.sub(r':nom$', '', edep['deprel'])
                    if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] += ':loc'
                elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']):
                    # There is just one occurrence and it is an error:
                    # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...'
                    # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'.
                    edep['deprel'] = 'obl:s:ins'
                elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']):
                    edep['deprel'] += ':dat'
                elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']):
                    edep['deprel'] += ':gen'
                elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']):
                    # Instrumental would be possible but unlikely.
                    edep['deprel'] += ':acc'
                else:
                    # Fallback normalization for everything not caught above.
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel'])  # ala vršovický dloubák
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel'])  # a la bondovky
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel'])  # ad infinitum
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel'])  # Beyond the Limits
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel'])  # de facto
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel'])  # Lido di Jesolo
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel'])  # do maloobchodní sítě (nebo k dalšímu zpracování)
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel'])  # bienvenue en France
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel'])  # made in NHL
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel'])  # made in NHL
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel'])  # these instances in FicTree should be spelled 'jako by'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel'])  # 'nejdou mezi, ale uvnitř odvětví a oborů'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel'])  # 'odložit na 1. září'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel'])  # 'na kurtě i mimo něj'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel'])  # 'na víc než čtyři a půl kilometru'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel'])  # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel'])  # 'zájem o obaly'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel'])  # University of North Carolina
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel'])  # per rollam
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel'])  # 'před a během utkání'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel'])  # 'před a po vyloučení Schindlera'
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel'])  # accusative: 'být s to' should be a fixed expression and it should be the predicate!
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel'])  # 'shodou okolností' is not a prepositional phrase
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', r'\1:v_kontextu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel'])  # colloquial: vo všecko
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel'])  # von Neumannem
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel'])  # Hoge Raad voor Diamant
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel'])
                    edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel'])

    def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
        '''
        Modifies the incoming relation of a node both in the basic tree and in
        the enhanced graph. If the node does not yet depend in the enhanced
        graph on the current basic parent, the new relation will be added without
        removing any old one. If the node already depends multiple times on the
        current basic parent in the enhanced graph, all such enhanced relations
        will be removed before adding the new one.
        '''
        old_parent = node.parent
        node.parent = parent
        node.deprel = deprel
        # Drop all enhanced relations pointing to the former basic parent,
        # then add the single new enhanced relation to the new parent.
        node.deps = [x for x in node.deps if x['parent'] != old_parent]
        new_edep = {}
        new_edep['parent'] = parent
        new_edep['deprel'] = edeprel
        node.deps.append(new_edep)
--- udapi/block/ud/sk/fixedeprels.py | 458 +------------------------------ 1 file changed, 1 insertion(+), 457 deletions(-) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index 6144a29b..c235ee78 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -5,24 +5,6 @@ class FixEdeprels(Block): - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) - # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological @@ -34,6 +16,7 @@ class FixEdeprels(Block): 'mimo': 'mimo:gen', 'na_rozdiel_od': 'na_rozdiel_od:gen', 'na_základ': 'na_základe:gen', + 'od': 'od:gen', 'pomoc': 'pomocou:gen', 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', @@ -78,445 +61,6 @@ def process_node(self, node): if m: edep['deprel'] = m.group(0)+':'+xnorm break - if re.match(r'^(acl|advcl):', edep['deprel']): - # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). - edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', 
r'\1:jelikož', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' - edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating - edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) - if edep['deprel'] == 'acl:v' and node.form == 'patře': - edep['deprel'] = 'nmod:v:loc' - node.deprel = 'nmod' - node.lemma = 'patro' - node.upos = 'NOUN' - node.xpos = 'NNNS6-----A----' - node.feats['Aspect'] = '' - node.feats['Gender'] = 'Neut' - 
node.feats['Tense'] = '' - node.feats['VerbForm'] = '' - node.feats['Voice'] = '' - edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) - elif re.match(r'^(nmod|obl):', edep['deprel']): - if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': - # This is a same-case noun-noun modifier, which just happens to be in the locative. - # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has - # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. - edep['deprel'] = 'nmod' - elif edep['deprel'] == 'obl:loc': - # Annotation error. The first occurrence in PDT dev: - # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' - # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. - # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. - edep['deprel'] = 'obl:v:loc' - elif edep['deprel'] == 'obl:arg:loc': - # Annotation error. The first occurrence in PDT dev: - edep['deprel'] = 'obl:arg:na:loc' - elif edep['deprel'] == 'nmod:loc': - # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. - edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': - # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? - # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. - edep['deprel'] = 'obl' - elif edep['deprel'] == 'nmod:voc': - # 'v 8. 
čísle tiskoviny Ty rudá krávo' - edep['deprel'] = 'nmod:nom' - elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): - edep['deprel'] += ':gen' - elif edep['deprel'] == 'nmod:co:nom': - # Annotation error: 'kompatibilní znamená tolik co slučitelný' - # 'co' should be relative pronoun rather than subordinating conjunction. - edep['deprel'] = 'acl:relcl' - node.deprel = 'acl:relcl' - elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' - elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): - edep['deprel'] = 'advcl:li' - elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^obl:místo_za:acc$', edep['deprel']): - # 'chytají krávu místo za rohy spíše za ocas' - # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. 
- for c in node.children: - if c.form == 'místo': - c.upos = 'ADV' - c.deprel = 'cc' - edep['deprel'] = 'obl:za:acc' - elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): - # The case is unknown. We need 'acc' or 'loc'. - # The locative is probably more frequent but it is not so likely with every noun. - # If there is an nummod:gov child, it must be accusative and not locative. - # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:arg:na_konec$', edep['deprel']): - # Annotation error. 
It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' - edep['deprel'] = 'obl:arg:na:acc' - elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): - # Annotation error. - if node.form == 's': - ohled = node.next_node - na = ohled.next_node - noun = na.next_node - self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') - self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(node, noun, 'case', 'case') - elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^nmod:pára:nom$', edep['deprel']): - # Annotation error: 'par excellence'. - edep['deprel'] = 'nmod' - for c in node.children: - if c.udeprel == 'case' and c.form.lower() == 'par': - c.lemma = 'par' - c.upos = 'ADP' - c.xpos = 'RR--X----------' - c.feats['Case'] = '' - c.feats['Gender'] = '' - c.feats['Number'] = '' - c.feats['Polarity'] = '' - c.feats['AdpType'] = 'Prep' - elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): - ###!!! 
Taky bychom se mohli dívat do XPOS předložky, protože tam bude pád uveden! - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): - # Accusative would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): - # Genitive would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': - # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. - # Find the content nominal. 
- cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] - vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] - if len(cnouns) > 0 and len(vs) > 0: - logging.info('I am here.') - cnoun = cnouns[0] - v = vs[0] - self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') - self.set_basic_and_enhanced(v, cnoun, 'case', 'case') - self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') - elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): - # ':nom' occurs in 'karneval v Rio de Janeiro' - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): - # There is just one occurrence and it is an error: - # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' - # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. - edep['deprel'] = 'obl:s:ins' - elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): - # Instrumental would be possible but unlikely. 
- edep['deprel'] += ':acc' - else: - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', 
edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', 
edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. 
září' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) - 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! 
- edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', r'\1:v_kontextu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) 
- edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', 
edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', 
edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 46819d52b6949d7ac0ddb4dc5c7dc6f84be7469b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 15:27:10 +0100 Subject: [PATCH 083/871] Refined processing of Slovak edeprels. --- udapi/block/ud/sk/fixedeprels.py | 56 +++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index c235ee78..d029a031 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -21,6 +21,7 @@ class FixEdeprels(Block): 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', 's': 's:ins', + 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl) 's_dôraz_na': 's_dôrazom_na:acc', 's_ohľad_na': 's_ohľadom_na:acc', 's_pomoc': 's_pomocou:gen', @@ -30,7 +31,8 @@ class FixEdeprels(Block): 'v_dôsledok': 'v_dôsledku:gen', 'v_meno': 'v_mene:gen', 'v_oblasť': 'v_oblasti:gen', - 'v_porovnanie_s': 'v_porovnaniu_s:ins', + 'v_porovnanie_s': 'v_porovnaní_s:ins', + 'v_porovnaniu_s': 'v_porovnaní_s:ins', 'v_priebeh': 'v_priebehu:gen', 'v_prípad': 'v_prípade:gen', 'v_prospech': 'v_prospech:gen', @@ -54,13 +56,51 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - for x, xnorm in unambiguous: - # All secondary prepositions have only one fixed morphological case - # they appear with, so we can replace whatever case we encounter with the correct one. 
- m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) - if m: - edep['deprel'] = m.group(0)+':'+xnorm - break + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if not solved: + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + if not solved: + # If we failed to identify the case of the preposition in the + # preceding steps, pick a default. It applies mostly to 'o' + # with wrongly split time values. + m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':o:acc' + solved = True + m = re.match(r'^(obl(?::arg)?|nmod):(po|v)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + solved = True + if not solved: + # Some cases do not occur with nominal modifiers without preposition. + # If we see them, chances are that it is the same-case modifier, + # and the same case just happens to be the one we see. 
For vocatives, + # it is also possible that they have been confused with nominatives. + m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1) + solved = True def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 05625f09c8a503d805644d1c31f988b6fbdbe81c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 16:43:36 +0100 Subject: [PATCH 084/871] Refined processing of Slovak edeprels. --- udapi/block/ud/sk/fixedeprels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index d029a031..4c19be89 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -17,6 +17,7 @@ class FixEdeprels(Block): 'na_rozdiel_od': 'na_rozdiel_od:gen', 'na_základ': 'na_základe:gen', 'od': 'od:gen', + 'pod_vplyv': 'pod_vplyvom:gen', 'pomoc': 'pomocou:gen', 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', From 891cf8966a98d4fc3dc781036aa26e7a652fa316 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 11 Jan 2022 11:04:09 +0100 Subject: [PATCH 085/871] Slovak cased edeprels. --- udapi/block/ud/sk/fixedeprels.py | 35 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index 4c19be89..7208b6ef 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -11,8 +11,16 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'a_hoci': 'hoci', + 'ako': 'ako', # remove morphological case + 'ako_na': 'ako', + 'akoby_z': 'z:gen', + 'akže': 'ak', + 'ani_keby': 'keby', + 'až_keď': 'keď', 'do': 'do:gen', 'k': 'k:dat', + 'kto': 'kým', ###!!! The lemma should be fixed! The pronoun has grammaticalized as a subordinator. 
'mimo': 'mimo:gen', 'na_rozdiel_od': 'na_rozdiel_od:gen', 'na_základ': 'na_základe:gen', @@ -21,6 +29,7 @@ class FixEdeprels(Block): 'pomoc': 'pomocou:gen', 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', + 'prv_ako': 'ako', 's': 's:ins', 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl) 's_dôraz_na': 's_dôrazom_na:acc', @@ -69,10 +78,10 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. if not solved: - # The following prepositions have more than one morphological case - # available. Thanks to the Case feature on prepositions, we can - # identify the correct one. m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see @@ -81,10 +90,10 @@ def process_node(self, node): if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() solved = True + # If we failed to identify the case of the preposition in the + # preceding steps, pick a default. It applies mostly to 'o' + # with wrongly split time values. if not solved: - # If we failed to identify the case of the preposition in the - # preceding steps, pick a default. It applies mostly to 'o' - # with wrongly split time values. m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel']) if m: edep['deprel'] = m.group(1)+':o:acc' @@ -93,15 +102,21 @@ def process_node(self, node): if m: edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' solved = True + # Some cases do not occur with nominal modifiers without preposition. + # If we see them, chances are that it is the same-case modifier, + # and the same case just happens to be the one we see. 
For vocatives, + # it is also possible that they have been confused with nominatives. if not solved: - # Some cases do not occur with nominal modifiers without preposition. - # If we see them, chances are that it is the same-case modifier, - # and the same case just happens to be the one we see. For vocatives, - # it is also possible that they have been confused with nominatives. m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel']) if m: edep['deprel'] = m.group(1) solved = True + # Annotation and conversion errors. + if not solved: + # Povedal som jej „na zdorovie“. + if edep['deprel'] == 'obl:arg:na' and node.form == 'zdorovie': + self.set_basic_and_enhanced(node, edep['parent'], 'ccomp', 'ccomp') + solved = True def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From a2e91a334e98fd4abe2bdc71b7e35ba314008399 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 17 Jan 2022 02:37:44 +0100 Subject: [PATCH 086/871] more params for corefud.MarkCrossing --- udapi/block/corefud/markcrossing.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py index 81136ec9..f357e7cc 100644 --- a/udapi/block/corefud/markcrossing.py +++ b/udapi/block/corefud/markcrossing.py @@ -1,15 +1,19 @@ from udapi.core.block import Block import udapi.core.coref import itertools +import logging class MarkCrossing(Block): """Find mentions with crossing spans.""" - def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, **kwargs): + def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, + log=True, mark=True, **kwargs): super().__init__(**kwargs) self.same_cluster_only = same_cluster_only self.continuous_only = continuous_only self.print_form = print_form + self.log = log + self.mark = mark def _print(self, mention): if self.print_form: @@ -25,4 +29,8 @@ def process_node(self, node): continue if 
self.continuous_only and (',' in mA.span or ',' in mB.span): continue - node.misc['Mark'] = f'cross:{self._print(mA)}+{self._print(mB)}' + msg = f'cross:{self._print(mA)}+{self._print(mB)}' + if self.mark: + node.misc['Mark'] = msg + if self.log: + print(msg) From 136ef06885b66eb68ea06d6a14d26e7d428c0354 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 18 Jan 2022 14:31:59 +0100 Subject: [PATCH 087/871] log each crossing just once (not for each node in the intersection) --- udapi/block/corefud/markcrossing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py index f357e7cc..a6d9346a 100644 --- a/udapi/block/corefud/markcrossing.py +++ b/udapi/block/corefud/markcrossing.py @@ -14,6 +14,7 @@ def __init__(self, same_cluster_only=False, continuous_only=False, print_form=Fa self.print_form = print_form self.log = log self.mark = mark + self._logged = {} def _print(self, mention): if self.print_form: @@ -29,8 +30,10 @@ def process_node(self, node): continue if self.continuous_only and (',' in mA.span or ',' in mB.span): continue - msg = f'cross:{self._print(mA)}+{self._print(mB)}' if self.mark: - node.misc['Mark'] = msg + node.misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" if self.log: - print(msg) + cross_id = node.root.sent_id + mA.span + mB.span + if cross_id not in self._logged: + self._logged[cross_id] = True + print(f"crossing mentions at {node}: {self._print(mA)} + {self._print(mB)}") From f0e76516eb0376493486db66e408dc2995f9acbc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 19 Jan 2022 22:05:21 +0100 Subject: [PATCH 088/871] util.FindBug can take any params so e.g. `util.FindBug block=eval.F1 focus=NOUN` will result in inspecting `eval.F1 focus=NOUN`. 
--- udapi/block/util/findbug.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py index e05afe76..e1ea838c 100644 --- a/udapi/block/util/findbug.py +++ b/udapi/block/util/findbug.py @@ -5,9 +5,12 @@ insert "util.FindBug block=" into the scenario, e.g. to debug ``second.Block``, use -udapy first.Block util.FindBug block=second.Block > bug.conllu + udapy first.Block util.FindBug block=second.Block > bug.conllu This will create the file bug.conllu with the bundle, which caused the bug. + +The second.Block can have any parameters, e.g. + udapy first.Block util.FindBug block=second.Block param1=value1 param2=value2 > bug.conllu """ import copy import logging @@ -20,24 +23,31 @@ class FindBug(BaseWriter): """Debug another block by finding a minimal testcase conllu file.""" - def __init__(self, block, first_error_only=True, **kwargs): - """Args: block, first_error_only""" - super().__init__(**kwargs) + def __init__(self, block, first_error_only=True, + files='-', filehandle=None, docname_as_file=False, encoding='utf-8', + newline='\n', overwrite=False, + **kwargs): + """Args: block, first_error_only. + All other parameters (which are not parameters of BaseWriter) + will be passed to the block being inspected. + """ + super().__init__(files, filehandle, docname_as_file, encoding, newline, overwrite) self.block = block self.first_error_only = first_error_only + self._kwargs = kwargs def process_document(self, document): sub_path, class_name = _parse_block_name(self.block) module = "udapi.block." + sub_path + "." 
+ class_name.lower() try: - command = "from " + module + " import " + class_name + " as b" + command = "from " + module + " import " + class_name + " as B" logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used except Exception: logging.warning("Error when trying import the block %s", self.block) raise - command = "b()" # TODO params as kwargs + command = "B(**self._kwargs)" logging.debug("Trying to evaluate this: %s", command) new_block = eval(command) # pylint: disable=eval-used From 6e64786578eab324257dae04a3036c29e4d42a7e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 19 Jan 2022 22:37:16 +0100 Subject: [PATCH 089/871] bugfix in eval.F1 When a pair of sentences contains no non-focused tokens, `nf_common == []` and we cannot use `while nf_common[c] != pred_tokens[i]`. --- udapi/block/eval/f1.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 9f265ac7..ca5510e4 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -125,6 +125,9 @@ def process_tree(self, tree): nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens) i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] while i < len(pred_tokens) and j < len(gold_tokens): + if c == len(nf_common): + common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) + break while nf_common[c] != pred_tokens[i]: un_pred.append(pred_tokens[i]) i += 1 @@ -135,9 +138,6 @@ def process_tree(self, tree): un_pred, un_gold = [], [] while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]: i, j, c = i+1, j+1, c+1 - if c == len(nf_common): - common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) - break common = [x for x in common if self.focus.fullmatch(x)] pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] From 7bcde4d7cb1adf8a4fc02882504fa0ff7a22654e Mon Sep 17 00:00:00 2001 
From: Dan Zeman Date: Wed, 9 Feb 2022 11:18:19 +0100 Subject: [PATCH 090/871] Reorganized the Czech block following Slovak, so it is more readable. --- udapi/block/ud/cs/fixedeprels.py | 460 ++++++++++++++++--------------- 1 file changed, 235 insertions(+), 225 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 5a2e996d..ac2653c3 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -5,6 +5,217 @@ class FixEdeprels(Block): + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'abi': 'aby', + 'aby_na': 'na', + 'ačkoliv': 'ačkoli', + 'ať': 'ať', # remove morphological case + 'ať_forma': 'formou:gen', + 'ať_v': 'v:loc', + 'ať_z': 'z:gen', + 'ať_z_strana': 'ze_strany:gen', + 'až_do': 'do:gen', + 'až_o': 'o:acc', + 'během': 'během:gen', + 'bez': 'bez:gen', + 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_zřetel_k': 'bez_zřetele_k:dat', + 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blíž': 'blízko:dat', + 'cesta': 'cestou:gen', + 'daleko': 'nedaleko:gen', + 'daleko_od': 'od:gen', + 'dík': 'díky:dat', + 'díky': 'díky:dat', + 'dle': 'dle:gen', + 'do': 'do:gen', + 'do_k': 'k:dat', + 'do_oblast': 'do_oblasti:gen', + 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_soulad_s': 'do_souladu_s:ins', + 'forma': 'formou:gen', + 'i_když': 'i_když', # remove morphological case + 'jak_aby': 'jak', + 'jak_ad': 'jak', + 'jakkoliv': 'jakkoli', + 'jako': 'jako', # remove morphological case + 'jako_kupříkladu': 'jako', + 'jakoby': 'jako', + 'jakoby_pod': 'pod:ins', + 'jelikož_do': 'jelikož', + 'jestli_že': 'jestliže', + 'k': 'k:dat', + 'k_konec': 'ke_konci:gen', + 'kdykoliv': 'kdykoli', + 'kol': 'kolem:gen', 
+ 'kolem': 'kolem:gen', + 'konec': 'koncem:gen', + 'kromě': 'kromě:gen', + 'liž': 'li', + 'mezi_uvnitř': 'uvnitř:gen', + 'na_báze': 'na_bázi:gen', + 'na_čelo': 'na_čele:gen', + 'na_mimo': 'na:loc', # na kurtě i mimo něj + 'na_než': 'na:acc', # na víc než čtyři a půl kilometru + 'na_od': 'na_rozdíl_od:gen', + 'na_podklad': 'na_podkladě:gen', + 'na_rozdíl_od': 'na_rozdíl_od:gen', + 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier + 'na_úroveň': 'na_úrovni:gen', + 'na_úsek': 'na_úseku:gen', + 'na_základ': 'na_základě:gen', + 'na_základna': 'na_základně:gen', + 'na_závěr': 'na_závěr:gen', + 'namísto': 'namísto:gen', + 'namísto_do': 'do:gen', + 'narozdíl_od': 'na_rozdíl_od:gen', + 'následek': 'následkem:gen', + 'navzdory': 'navzdory:dat', + 'nedaleko': 'nedaleko:gen', + 'než': 'než', # remove morphological case + 'nežli': 'nežli', # remove morphological case + 'o_jako': 'jako', + 'o_o': 'o:acc', + 'od': 'od:gen', + 'ohledně': 'ohledně:gen', + 'okolo': 'okolo:gen', + 'oproti': 'oproti:dat', + 'po_v': 'po:loc', + 'po_doba': 'po_dobu:gen', + 'po_vzor': 'po_vzoru:gen', + 'poblíž': 'poblíž:gen', + 'počátek': 'počátkem:gen', + 'počínat': 'počínaje:ins', + 'pod_dojem': 'pod_dojmem:gen', + 'pod_vliv': 'pod_vlivem:gen', + 'podle': 'podle:gen', + 'pomoc': 'pomocí:gen', + 'pomocí': 'pomocí:gen', + 'postup': 'postupem:gen', + 'pouze_v': 'v:loc', + 'pro': 'pro:acc', + 'prostřednictví': 'prostřednictvím:gen', + 'prostřednictvím': 'prostřednictvím:gen', + 'proti': 'proti:dat', + 'protože': 'protože', # remove morphological case + 'před_během': 'během:gen', # před a během utkání + 'před_po': 'po:loc', # před a po vyloučení Schindlera + 'přes': 'přes:acc', + 'přestože': 'přestože', # remove morphological case + 'při': 'při:loc', + 'při_příležitost': 'při_příležitosti:gen', + 's_ohled_k': 's_ohledem_k:dat', + 's_ohled_na': 's_ohledem_na:acc', + 's_pomoc': 's_pomocí:gen', + 's_přihlédnutí_k': 
's_přihlédnutím_k:dat', + 's_přihlédnutí_na': 's_přihlédnutím_na:acc', + 's_výjimka': 's_výjimkou:gen', + 's_vyloučení': 's_vyloučením:gen', + 's_zřetel_k': 'se_zřetelem_k:dat', + 's_zřetel_na': 'se_zřetelem_na:acc', + 'severně_od': 'od:gen', + 'skrz': 'skrz:acc', + 'směr_do': 'směrem_do:gen', + 'směr_k': 'směrem_k:dat', + 'směr_na': 'směrem_na:acc', + 'směr_od': 'směrem_od:gen', + 'společně_s': 'společně_s:ins', + 'spolu': 'spolu_s:ins', + 'spolu_s': 'spolu_s:ins', + 'stranou': 'stranou:gen', + 'takže': 'takže', # remove morphological case + 'takže_a': 'takže', + 'třebaže': 'třebaže', # remove morphological case + 'u': 'u:gen', + 'u_příležitost': 'u_příležitosti:gen', + 'uprostřed': 'uprostřed:gen', + 'uvnitř': 'uvnitř:gen', + 'v_analogie_s': 'v_analogii_s:ins', + 'v_čelo': 'v_čele:gen', + 'v_čelo_s': 'v_čele_s:ins', + 'v_dohoda_s': 'v_dohodě_s:ins', + 'v_duch': 'v_duchu:gen', + 'v_důsledek': 'v_důsledku:gen', + 'v_forma': 've_formě:gen', + 'v_jméno': 've_jménu:gen', + 'v_k': 'k:dat', + 'v_kombinace_s': 'v_kombinaci_s:ins', + 'v_konfrontace_s': 'v_konfrontaci_s:ins', + 'v_kontext_s': 'v_kontextu_s:ins', + 'v_na': 'na:loc', + 'v_oblast': 'v_oblasti:gen', + 'v_oblast_s': 's:ins', + 'v_obor': 'v_oboru:gen', + 'v_otázka': 'v_otázce:gen', + 'v_podoba': 'v_podobě:gen', + 'v_poměr_k': 'v_poměru_k:dat', + 'v_proces': 'v_procesu:gen', + 'v_prospěch': 've_prospěch:gen', + 'v_protiklad_k': 'v_protikladu_k:dat', + 'v_průběh': 'v_průběhu:gen', + 'v_případ': 'v_případě:gen', + 'v_případ_že': 'v_případě_že', + 'v_rámec': 'v_rámci:gen', + 'v_rozpor_s': 'v_rozporu_s:ins', + 'v_řada': 'v_řadě:gen', + 'v_shoda_s': 've_shodě_s:ins', + 'v_služba': 've_službách:gen', + 'v_směr': 've_směru:gen', + 'v_směr_k': 've_směru_k:dat', + 'v_smysl': 've_smyslu:gen', + 'v_součinnost_s': 'v_součinnosti_s:ins', + 'v_souhlas_s': 'v_souhlasu_s:ins', + 'v_soulad_s': 'v_souladu_s:ins', + 'v_souvislost_s': 'v_souvislosti_s:ins', + 'v_spojení_s': 've_spojení_s:ins', + 'v_spojený_s': 've_spojení_s:ins', + 
'v_spojitost_s': 've_spojitosti_s:ins', + 'v_spolupráce_s': 've_spolupráci_s:ins', + 'v_s_spolupráce': 've_spolupráci_s:ins', + 'v_srovnání_s': 've_srovnání_s:ins', + 'v_srovnání_se': 've_srovnání_s:ins', + 'v_světlo': 've_světle:gen', + 'v_věc': 've_věci:gen', + 'v_vztah_k': 've_vztahu_k:dat', + 'v_zájem': 'v_zájmu:gen', + 'v_záležitost': 'v_záležitosti:gen', + 'v_závěr': 'v_závěru:gen', + 'v_závislost_na': 'v_závislosti_na:loc', + 'v_závislost_s': 'v_závislosti_s:ins', + 'v_znamení': 've_znamení:gen', + 'včetně': 'včetně:gen', + 'vedle': 'vedle:gen', + 'vina': 'vinou:gen', + 'vliv': 'vlivem:gen', + 'vůči': 'vůči:dat', + 'vzhledem': 'vzhledem_k:dat', + 'vzhledem_k': 'vzhledem_k:dat', + 'z': 'z:gen', + 'z_důvod': 'z_důvodu:gen', + 'z_hledisko': 'z_hlediska:gen', + 'z_oblast': 'z_oblasti:gen', + 'z_řada': 'z_řad:gen', + 'z_strana': 'ze_strany:gen', + 'z_nedostatek': 'z_nedostatku:gen', + 'z_titul': 'z_titulu:gen', + 'za_pomoc': 'za_pomoci:gen', + 'za_účast': 'za_účasti:gen', + 'za_účel': 'za_účelem:gen', + 'začátek': 'začátkem:gen', + 'zásluha': 'zásluhou:gen', + 'zatím_co': 'zatímco', + 'závěr': 'závěrem:gen', + 'závisle_na': 'nezávisle_na:loc', + 'že_ať': 'ať', + 'že_jako': 'že', + 'že_za': 'za:gen' + } + def process_node(self, node): """ Occasionally the edeprels automatically derived from the Czech basic @@ -12,26 +223,39 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. 
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating @@ -39,14 +263,9 @@ def 
process_node(self, node): edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': edep['deprel'] = 'nmod:v:loc' node.deprel = 'nmod' @@ -84,27 +303,11 @@ def process_node(self, node): elif edep['deprel'] == 'nmod:voc': # 'v 8. čísle tiskoviny Ty rudá krávo' edep['deprel'] = 'nmod:nom' - elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): - edep['deprel'] += ':gen' elif edep['deprel'] == 'nmod:co:nom': # Annotation error: 'kompatibilní znamená tolik co slučitelný' # 'co' should be relative pronoun rather than subordinating conjunction. 
edep['deprel'] = 'acl:relcl' node.deprel = 'acl:relcl' - elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' - elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): edep['deprel'] = 'advcl:li' elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): @@ -145,17 +348,11 @@ def process_node(self, node): edep['deprel'] += ':acc' else: edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): - edep['deprel'] += ':dat' elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' else: edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): # Annotation error. if node.form == 's': @@ -166,10 +363,6 @@ def process_node(self, node): self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') self.set_basic_and_enhanced(node, noun, 'case', 'case') - elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): - edep['deprel'] += ':dat' elif re.match(r'^nmod:pára:nom$', edep['deprel']): # Annotation error: 'par excellence'. 
edep['deprel'] = 'nmod' @@ -184,42 +377,21 @@ def process_node(self, node): c.feats['Polarity'] = '' c.feats['AdpType'] = 'Prep' elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): - ###!!! Taky bychom se mohli dívat do XPOS předložky, protože tam bude pád uveden! if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' else: edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' else: edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): - edep['deprel'] += ':dat' elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): # Accusative would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): - edep['deprel'] += ':loc' elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): # Genitive would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. # Find the content nominal. 
@@ -244,51 +416,23 @@ def process_node(self, node): # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. edep['deprel'] = 'obl:s:ins' - elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # 
bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL @@ -296,161 +440,27 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. 
září' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', r'\1:v_kontextu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', 
r'\1:z_řad:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 613b26373af4fad9e28cc7186c82f1f171038901 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 9 Feb 2022 20:20:34 +0100 Subject: [PATCH 091/871] Manually synchronized fixedeprels.py in master with the changes done in gum-format. 
--- udapi/block/ud/cs/fixedeprels.py | 56 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index ac2653c3..b3e551e5 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -55,6 +55,7 @@ class FixEdeprels(Block): 'kol': 'kolem:gen', 'kolem': 'kolem:gen', 'konec': 'koncem:gen', + 'krom': 'kromě:gen', 'kromě': 'kromě:gen', 'liž': 'li', 'mezi_uvnitř': 'uvnitř:gen', @@ -211,8 +212,10 @@ class FixEdeprels(Block): 'zatím_co': 'zatímco', 'závěr': 'závěrem:gen', 'závisle_na': 'nezávisle_na:loc', + 'že': 'že', # remove morphological case 'že_ať': 'ať', 'že_jako': 'že', + 'že_jakoby': 'že', 'že_za': 'za:gen' } @@ -223,14 +226,14 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) if m: bdeprel = m.group(1) solved = False for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. 
- m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) if m: edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True @@ -253,19 +256,19 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:na_způsob:gen$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:podle:gen$', 
r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro:acc$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu:gen$', r'obl:v_duchu:gen', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': edep['deprel'] = 'nmod:v:loc' node.deprel = 'nmod' @@ -310,6 +313,8 @@ def process_node(self, node): node.deprel = 'acl:relcl' elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): + edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' @@ -329,7 +334,8 @@ def process_node(self, node): edep['deprel'] = 'obl:za:acc' elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): + elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): + edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) # The case is unknown. We need 'acc' or 'loc'. # The locative is probably more frequent but it is not so likely with every noun. # If there is an nummod:gov child, it must be accusative and not locative. 
@@ -398,7 +404,6 @@ def process_node(self, node): cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] if len(cnouns) > 0 and len(vs) > 0: - logging.info('I am here.') cnoun = cnouns[0] v = vs[0] self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') @@ -420,26 +425,33 @@ def process_node(self, node): # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + # If one of the following expressions occurs followed by another preposition, + # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) + # + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) @@ -447,12 +459,10 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem From c691804af00366f60af2cac8fd5823404e1f83bc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 10:22:33 +0100 Subject: [PATCH 092/871] GUM format support (#96) * global.Entity support * shortcuts: doc.coref_mentions and tree.document * reading and writing new (CorefUD 1.0) format of coreference * oops, bug in detecting discontinuous mentions * fix ordering of brackets in 
serialization for crossing mention spans * change CorefMention.__lt__, document rules for serialization If two mentions start at the same word, the longer must be saved first, in the new format. However, we cannot cycle through `reversed(doc.coref_mentions)` because that would break the ordering of closing brackets. The easiest solution seems to be to redefine `CorefMention.__lt__`, so that it follows the order in which mentions must be stored in the new format. * BridgingLinks string represenation now follows the new format but we can have multiple src mentions in a single `Bridge=` annotation, e.g. `Entity=(e5(e6|Bridge=e1 --- udapi/block/corefud/fixcorefud02.py | 56 ++ udapi/block/corefud/fixinterleaved.py | 82 +++ udapi/block/corefud/indexclusters.py | 3 +- udapi/block/corefud/markinterleaved.py | 45 ++ udapi/block/corefud/marksamesubspan.py | 45 ++ udapi/block/corefud/mergesamespan.py | 51 ++ udapi/block/corefud/movehead.py | 5 +- udapi/block/corefud/printclusters.py | 10 +- udapi/block/read/conllu.py | 14 +- udapi/block/read/oldcorefud.py | 52 +- udapi/block/write/conllu.py | 18 +- udapi/block/write/oldcorefud.py | 10 +- udapi/core/basereader.py | 3 + udapi/core/coref.py | 796 ++++++++++++++++++++----- udapi/core/document.py | 10 + udapi/core/dualdict.py | 2 +- udapi/core/root.py | 4 + udapi/core/run.py | 14 +- 18 files changed, 1046 insertions(+), 174 deletions(-) create mode 100644 udapi/block/corefud/fixcorefud02.py create mode 100644 udapi/block/corefud/fixinterleaved.py create mode 100644 udapi/block/corefud/markinterleaved.py create mode 100644 udapi/block/corefud/marksamesubspan.py create mode 100644 udapi/block/corefud/mergesamespan.py diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py new file mode 100644 index 00000000..b8fe44f7 --- /dev/null +++ b/udapi/block/corefud/fixcorefud02.py @@ -0,0 +1,56 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +NEW_ETYPE = { + "misc": "other", 
+ "date": "time", + "loc": "place", + "location": "place", + "per": "person", + "org": "organization", + "_": "", + } + +class FixCorefUD02(Block): + """Fix errors in CorefUD 0.2 for release of CorefUD 1.0.""" + + def process_document(self, doc): + # For GUM + if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity': + doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity' + + for cluster in doc.coref_clusters.values(): + if cluster.cluster_type: + # Harmonize etype. + # If gen/spec is distinguished, store it in all mentions' other['gstype']. + etype = cluster.cluster_type.lower() + if etype.startswith('spec') or etype.startswith('gen'): + gstype = 'gen' if etype.startswith('gen') else 'spec' + for m in cluster.mentions: + m.other['gstype'] = gstype + if etype == 'spec': + etype = 'other' + etype = etype.replace('gen', '').replace('spec', '').replace('.', '') + etype = NEW_ETYPE.get(etype, etype) + + # cluster_type="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. + # Apposition is a mention-based rather than cluster-based attribute. + # We don't know which of the mentions it should be assigned, but let's expect all non-first. + # UD marks appositions with deprel appos, so once someone checks it is really redunant, + # TODO we can delete the appos mention attribute. 
+ if etype == 'appos': + etype = '' + for mention in cluster.mentions[1:]: + mention.other['appos'] = '1' + cluster.cluster_type = etype + + for mention in cluster.mentions: + # Harmonize bridge relation labels + for bridge in mention.bridging: + rel = bridge.relation.lower() + if rel.endswith('-inv'): + rel = 'i' + rel.replace('-inv', '') + rel = rel.replace('-', '') + rel = rel.replace('indirect_', '') + bridge.relation = rel diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py new file mode 100644 index 00000000..6921c680 --- /dev/null +++ b/udapi/block/corefud/fixinterleaved.py @@ -0,0 +1,82 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class FixInterleaved(Block): + """Fix mentions with interleaved or crossing spans.""" + + def __init__(self, same_cluster_only=True, both_discontinuous=False, + crossing_only=False, nested_same_subspan=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.both_discontinuous = both_discontinuous + self.crossing_only = crossing_only + self.nested_same_subspan = nested_same_subspan + + def process_tree(self, tree): + mentions, deleted = set(), set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if mA in deleted or mB in deleted: + continue + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + + # Fully nested spans are OK, expect for same-subspan + sA, sB = set(mA.words), set(mB.words) + if (sA <= sB) or (sB <= sA): + if not self.nested_same_subspan: + continue + elif not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + + # Crossing or interleaved+crossing? 
+ elif self.crossing_only: + if not sA.intersection(sB): + continue + else: + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + + mA.words = list(sA.union(sB)) + for wb in sB: + try: + wb._mentions.remove(mB) + except ValueError: + pass + try: + mB.cluster.mentions.remove(mB) + except ValueError: + pass + deleted.add(mB) + + # By changing the mA.words, we could have create another error: + # making the span same as another mention. Let's fix it + sA = set(mA.words) + for mC in mentions: + if mC in deleted or mC is mA or mC is mB: + continue + if sA != set(mC.words): + continue + # So mA and mC have the same span and we need to delete one of them to fix it. + # We will delete mA because it has the artificially enlarged span, + # while mC is from the original annotation. + for wa in sA: + try: + wa._mentions.remove(mA) + except ValueError: + pass + try: + mA.cluster.mentions.remove(mA) + except ValueError: + pass + break + deleted.add(mA) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index dee45544..1496c11c 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -27,8 +27,7 @@ def process_document(self, doc): for idx, cid in enumerate(clusters, self.start): cluster = clusters[cid] new_cid = "c" + str(idx) - # need to change private variable - cluster._cluster_id = new_cid + cluster.cluster_id = new_cid new_clusters[new_cid] = cluster self.start = idx + 1 doc._coref_clusters = new_clusters diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py new file mode 100644 index 00000000..ac4d9438 --- /dev/null +++ b/udapi/block/corefud/markinterleaved.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class 
MarkInterleaved(Block): + """Find mentions with interleaved spans.""" + + def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.both_discontinuous = both_discontinuous + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.cluster.cluster_id + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if set(mA.words).intersection(set(mB.words)): + continue + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"interleaved mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py new file mode 100644 index 00000000..f99e0e13 --- /dev/null +++ b/udapi/block/corefud/marksamesubspan.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkSameSubSpan(Block): + """Find mentions with the same subspan.""" + + def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, nested_only=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = 
same_cluster_only + self.both_discontinuous = both_discontinuous + self.nested_only = nested_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.cluster.cluster_id + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if self.nested_only and not (sA <= sB) and not (sB <= sA): + continue + if not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"same-subspan mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py new file mode 100644 index 00000000..d5a46d25 --- /dev/null +++ b/udapi/block/corefud/mergesamespan.py @@ -0,0 +1,51 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MergeSameSpan(Block): + """ + Multiple same-span mentions are considered invalid in CoNLL-U, whether they + belong to the same entity or not. If they occur, merge them into one. + Note: We currently do not have mentions across sentence boundaries in the + CorefUD data, so this block processes one sentence at a time. 
+ """ + + def __init__(self, same_cluster_only=False, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + + sA, sB = set(mA.words), set(mB.words) + if sA != sB: + continue + + # If the mentions belong to different clusters, we should merge the + # clusters first, i.e., pick one cluster as the survivor, move the + # mentions from the other cluster to this cluster, and remove the + # other cluster. + if mA.cluster != mB.cluster: + logging.warning("Merging same-span mentions that belong to different entities: '%s' vs. '%s'." % (mA.cluster.cluster_id, mB.cluster.cluster_id)) + ###!!! TODO: As of now, changing the cluster of a mention is not supported in the API. + #for m in mB.cluster.mentions: + # m.cluster = mA.cluster + # Remove mention B. It may have been removed earlier because of + # another duplicate, that is the purpose of try-except. 
+ for wb in sB: + try: + wb._mentions.remove(mB) + except ValueError: + pass + try: + mB.cluster.mentions.remove(mB) + except ValueError: + pass diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py index e9034a22..2a38bd82 100644 --- a/udapi/block/corefud/movehead.py +++ b/udapi/block/corefud/movehead.py @@ -6,9 +6,10 @@ class MoveHead(Block): """Block corefud.MoveHead moves the head to the highest node in each mention.""" - def __init__(self, bugs='warn', **kwargs): + def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs): self.counter = Counter() self.bugs = bugs + self.keep_head_if_possible = keep_head_if_possible super().__init__(**kwargs) def _eparents(self, node): @@ -68,7 +69,7 @@ def find_head(self, mention): mention.head.misc['Bug'] = 'highest-head' # Fifth, try to convervatively preserve the original head, if it is one of the possible heads. - if mention.head in enh_heads: + if self.keep_head_if_possible and mention.head in enh_heads: return mention.head, 'nontreelet' # Finally, return the word-order-wise first head candidate as the head. diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printclusters.py index a9a03f5e..7271ae78 100644 --- a/udapi/block/corefud/printclusters.py +++ b/udapi/block/corefud/printclusters.py @@ -6,17 +6,20 @@ class PrintClusters(Block): """Block corefud.PrintClusters prints all mentions of a given cluster.""" - def __init__(self, id_re=None, min_mentions=0, print_ranges=True, aggregate_mentions=True, **kwargs): + def __init__(self, id_re=None, min_mentions=0, print_ranges=True, mark_head=True, + aggregate_mentions=True, **kwargs): """Params: id_re: regular expression constraining ClusterId of the clusters to be printed min_mentions: print only clusters with with at least N mentions print_ranges: print also addressess of all mentions (compactly, using the longest common prefix of sent_id) + mark_head: mark the head (e.g. 
as "red **car**") """ super().__init__(**kwargs) self.id_re = re.compile(str(id_re)) if id_re else None self.min_mentions = min_mentions self.print_ranges = print_ranges + self.mark_head = mark_head self.aggregate_mentions = aggregate_mentions def process_document(self, doc): @@ -32,7 +35,7 @@ def process_document(self, doc): counter = Counter() ranges = defaultdict(list) for mention in cluster.mentions: - forms = ' '.join([w.form for w in mention.words]) + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) counter[forms] += 1 if self.print_ranges: ranges[forms].append(mention.head.root.address() + ':' +mention.span) @@ -46,6 +49,7 @@ def process_document(self, doc): print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') else: for mention in cluster.mentions: - print(' ' + ' '.join([w.form for w in mention.words])) + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) + print(' ' + forms) if self.print_ranges: print(f" {mention.head.root.address()}:{mention.span}") diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 71886752..97e39970 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -14,6 +14,7 @@ RE_TEXT = re.compile(r'^# text\s*=\s*(.*)') RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?') RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') +RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)') class Conllu(BaseReader): @@ -33,8 +34,7 @@ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs self.empty_parent = empty_parent self.fix_cycles = fix_cycles - @staticmethod - def parse_comment_line(line, root): + def parse_comment_line(self, line, root): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" sent_id_match = RE_SENT_ID.match(line) if sent_id_match is not None: @@ -68,6 +68,16 @@ def 
parse_comment_line(line, root): container = root.json['__doc__'] container[json_match.group(2)] = json.loads(json_match.group(3)) return + + entity_match = RE_GLOBAL_ENTITY.match(line) + if entity_match is not None: + global_entity = entity_match.group(1) + if self._global_entity and self._global_entity != global_entity: + logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity)) + self._global_entity = global_entity + root.comment += '$GLOBAL.ENTITY\n' + return + root.comment += line[1:] + "\n" def read_trees(self): diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py index a7bc3101..539d5036 100644 --- a/udapi/block/read/oldcorefud.py +++ b/udapi/block/read/oldcorefud.py @@ -6,6 +6,33 @@ class OldCorefUD(udapi.block.read.conllu.Conllu): + def __init__(self, replace_hyphen_in_id_with='', **kwargs): + """Create the read.OldCorefUD reader object. + + Args: + substitute_hyphen_in_id_for: string to use as a replacement for hyphens in ClusterId + The new format does not allow hyphens in eid (IDs of entity clusters), + so we need to replace them. 
+ """ + super().__init__(**kwargs) + self.replace_hyphen_in_id_with = replace_hyphen_in_id_with + self.orig2new = {} + self.new2orig = {} + + def _fix_id(self, cid): + if not cid or '-' not in cid: + return cid + new_cid = self.orig2new.get(cid) + if new_cid is None: + new_cid = cid.replace('-', self.replace_hyphen_in_id_with) + base, counter = new_cid, 1 + while new_cid in self.new2orig: + counter += 1 + new_cid = f"{base}{counter}" + self.new2orig[new_cid] = cid + self.orig2new[cid] = new_cid + return new_cid + def process_document(self, doc, strict=True): super().process_document(doc) @@ -16,25 +43,32 @@ def process_document(self, doc, strict=True): if not cluster_id: index, index_str = 1, "[1]" cluster_id = node.misc["ClusterId[1]"] + cluster_id = self._fix_id(cluster_id) while cluster_id: cluster = clusters.get(cluster_id) if cluster is None: cluster = CorefCluster(cluster_id) clusters[cluster_id] = cluster - mention = CorefMention(node, cluster) + mention = CorefMention(words=[node], cluster=cluster) if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] - else: - mention.words = [node] cluster_type = node.misc["ClusterType" + index_str] - if cluster_type is not None: + if cluster_type: if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type bridging_str = node.misc["Bridging" + index_str] if bridging_str: - mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict) + mention._bridging = BridgingLinks(mention) + for link_str in bridging_str.split(','): + target, relation = link_str.split(':') + target = self._fix_id(target) + if target == cluster_id: + _error("Bridging cannot self-reference the same cluster: " + target, strict) + if target not in clusters: + clusters[target] = CorefCluster(target) + mention._bridging.append((clusters[target], relation)) 
split_ante_str = node.misc["SplitAnte" + index_str] if split_ante_str: @@ -42,6 +76,7 @@ def process_document(self, doc, strict=True): # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. for ante_str in split_ante_str.replace('+', ',').split(','): + ante_str = self._fix_id(ante_str) if ante_str in clusters: if ante_str == cluster_id: _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) @@ -53,10 +88,13 @@ def process_document(self, doc, strict=True): split_antes.append(ante_cl) cluster.split_ante = sorted(split_antes) - mention.misc = node.misc["MentionMisc" + index_str] + # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas. + # We also need to escape forbidden characters. + mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',') + mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') index += 1 index_str = f"[{index}]" - cluster_id = node.misc["ClusterId" + index_str] + cluster_id = self._fix_id(node.misc["ClusterId" + index_str]) # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), # not by the keys (cluster_id). diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 66ae320b..abe20963 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -26,7 +26,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual # value of the attribute and make note on which line (i_*) they were present. 
comment_lines = tree.comment.splitlines() - i_newdoc, i_newpar, i_sent_id, i_text = -1, -1, -1, -1 + i_newdoc, i_newpar, i_sent_id, i_text, i_global_entity = -1, -1, -1, -1, -1 for i, c_line in enumerate(comment_lines): if c_line == '$SENT_ID': i_sent_id = i @@ -50,6 +50,13 @@ def process_tree(self, tree): # pylint: disable=too-many-branches comment_lines[i] = ' newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '') else: comment_lines[i] = None + elif c_line == '$GLOBAL.ENTITY': + i_global_entity = i + ge = tree.document.meta.get('global.Entity') + if ge: + comment_lines[i] = ' global.Entity = ' + ge + else: + comment_lines[i] = None # Now print the special comments: global.columns, newdoc, newpar, sent_id and text. # If these comments were already present in tree.comment (as marked with the placeholders), @@ -68,6 +75,15 @@ def process_tree(self, tree): # pylint: disable=too-many-branches printed_i += 1 if comment_lines[printed_i]: print('#' + comment_lines[printed_i]) + ge = tree.document.meta.get('global.Entity') + if ge: + if i_global_entity == -1: + print('# global.Entity = ' + ge) + else: + while printed_i < i_global_entity: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) if tree.newpar: if i_newpar == -1: print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '')) diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py index c6c38cbe..4eb316bb 100644 --- a/udapi/block/write/oldcorefud.py +++ b/udapi/block/write/oldcorefud.py @@ -6,9 +6,8 @@ class OldCorefUD(udapi.block.write.conllu.Conllu): def process_document(self, doc): - if not doc._coref_clusters: + if not doc.coref_clusters: logging.warning("Using write.OldCorefUD on a document without any coreference annotation") - doc._coref_clusters = {} # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. 
attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() @@ -16,6 +15,7 @@ def process_document(self, doc): for key in list(node.misc): if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): del node.misc[key] + del doc.meta['global.Entity'] # doc._coref_clusters is a dict, which is insertion ordered in Python 3.7+. # The insertion order is sorted according to CorefCluster.__lt__ (see few lines above). @@ -48,11 +48,11 @@ def process_document(self, doc): head.misc["MentionSpan" + index_str] = mention.span head.misc["ClusterType" + index_str] = cluster.cluster_type if mention._bridging: - head.misc["Bridging" + index_str] = str(mention.bridging) + head.misc["Bridging" + index_str] = ','.join(f'{l.target.cluster_id}:{l.relation}' for l in sorted(mention.bridging)) if cluster.split_ante: serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) head.misc["SplitAnte" + index_str] = serialized - if mention.misc: - head.misc["MentionMisc" + index_str] = mention.misc + if mention.other: + head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') super().process_document(doc) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 05f204b9..fee9da4c 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -28,6 +28,7 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self._global_entity = None @staticmethod def is_multizone_reader(): @@ -110,6 +111,7 @@ def try_fast_load(self, document): return False document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity if trees and trees[0].newdoc and trees[0].newdoc is not True: document.meta["docname"] = trees[0].newdoc @@ -187,6 +189,7 @@ def process_document(self, document): break if trees_loaded == 0: 
document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity add_to_the_last_bundle = False trees_loaded += 1 diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 6236e4cf..aaaa07f0 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,42 +1,172 @@ -"""Classes for handling coreference.""" +"""Classes for handling coreference. + +# CorefUD 1.0 format implementation details + +## Rules for ordering "chunks" within `node.misc['Entity']` +Entity mentions are annotated using "chunks" stored in `misc['Entity']`. +Chunks are of three types: +1. opening bracket, e.g. `(e1-person` +2. closing bracket, e.g. `e1-person)` +3. single-word span (both opening and closing), e.g. `(e1-person)` + +The `Entity` MISC attribute contains a sequence of chunks +without any separators, e.g. `Entity=(e1-person(e2-place)` +means opening `e1` mention and single-word `e2` mention +starting on a given node. + +### Crossing mentions +Two mentions are crossing iff their spans have non-empty intersection, +but neither is a subset of the other, e.g. `e1` spanning nodes 1-3 +and `e2` spanning 2-4 would be represented as: +``` +1 ... Entity=(e1 +2 ... Entity=(e2 +3 ... Entity=e1) +4 ... Entity=e2) +``` +This may be an annotation error and we may forbid such cases in future annotation guidelines, +but in CorefUD 0.2, there are thousands of such cases (see https://github.com/ufal/corefUD/issues/23). + +It can even happen that one entity ends and another starts at the same node: `Entity=e1)(e2` +For this reason, we need + +**Rule1**: closing brackets MUST always precede opening brackets. +Otherwise, we would get `Entity=(e2e1)`, which could not be parsed. + +Note that we cannot have same-entity crossing mentions in the CorefUD 1.0 format, +so e.g. if we substitute `e2` with `e1` in the example above, we'll get +`(e1`, `e1)`, `(e1`, `e1)`, which will be interpreted as two non-overlapping mentions of the same entity. 
+ +### Nested mentions +One mention (span) can be often embedded within another mention (span). +It can happen that both these mentions correspond to the same entity (i.e. are in the same cluster), +for example, "` sold the world>`". +It can even happen that both mentions start at the same node, e.g. "`< w3>`" (TODO: find nice real-world examples). +In such cases, we need to make sure the brackets are well-nested: + +**Rule2**: when opening multiple brackets at the same node, longer mentions MUST be opened first. + +This is important because +- The closing bracket has the same form for both mentions of the same entity - it includes just the entity ID (`eid`). +- The opening-bracket annotation contains other mention attributes, e.g. head index. +- The two mentions may differ in these attributes, e.g. the "``" mention's head may be w3. +- When breaking Rule2, we would get +``` +1 w1 ... Entity=(e1-person-1(e1-person-3 +2 w2 ... Entity=e1) +3 w3 ... Entity=e1) +``` +which would be interpreted as if the head of the "``" mention is its third word, which is invalid. + +### Other rules + +**Rule3**: when closing multiple brackets at the same node, shorter mentions SHOULD be closed first. +See Rule4 for a single exception from this rule regarding crossing mentions. +I'm not aware of any problems when breaking this rule, but it seems intuitive +(to make the annotation well-nested if possible) and we want to define some canonical ordering anyway. +The API should be able to load even files breaking Rule3. + +**Rule4**: single-word chunks SHOULD follow all opening brackets and precede all closing brackets if possible. +When considering single-word chunks as a subtype of both opening and closing brackets, +this rule follows from the well-nestedness (and Rule2). +So we should have `Entity=(e1(e2)` and `Entity=(e3)e1)`, +but the API should be able to load even `Entity=(e2)(e1` and `Entity=e1)(e3)`. 
+ +In case of crossing mentions (annotated following Rule1), we cannot follow Rule4. +If we want to add a single-word mention `e2` to a node with `Entity=e1)(e3`, +it seems intuitive to prefer Rule2 over Rule3, which results in `Entity=e1)(e3(e2)`. +So the canonical ordering will be achieved by placing single-word chunks after all opening brackets. +The API should be able to load even `Entity=(e2)e1)(e3` and `Entity=e1)(e2)(e3`. + +**Rule5**: ordering of same-span single-word mentions +TODO: I am not sure here. We may want to forbid such cases or define canonical ordering even for them. +E.g. `Entity=(e1)(e2)` vs. `Entity=(e2)(e1)`. + +**Rule6**: ordering of same-start same-end multiword mentions +TODO: I am not sure here. +These can be either same-span multiword mentions (which may be forbidden) +or something like +``` +1 w1 ... Entity=(e1(e2[1/2]) +2 w2 ... +3 w3 ... Entity=(e2[2/2])e1) +``` +where both `e1` and `e2` start at w1 and end at w3, but `e2` is discontinuous and does not contain w2. +If we interpret "shorter" and "longer" in Rule2 and Rule3 as `len(mention.words)` +(and not as `mention.words[-1].ord - mention.words[0].ord`), +we get the canonical ordering as in the example above. 
+ +""" import re import functools import collections +import collections.abc +import copy import logging @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words', 'misc'] + __slots__ = ['_head', '_cluster', '_bridging', '_words', '_other'] - def __init__(self, head, cluster=None): - self._head = head + def __init__(self, words, head=None, cluster=None): + if not words: + raise ValueError("mention.words must be non-empty") + self._words = words + self._head = head if head else words[0] self._cluster = cluster if cluster is not None: cluster._mentions.append(self) self._bridging = None - self._words = [] - self.misc = None + self._other = None - def __lt__(self, other): - """Does this mention precedes (word-order wise) the `other` mention? + def __lt__(self, another): + """Does this mention precedes (word-order wise) `another` mention? This method defines a total ordering of all mentions - (within one cluster or across different clusters). - The position is primarily defined by the first word in each mention - (or by the head if mention.words are missing). + (within one entity or across different entities). + The position is primarily defined by the first word in each mention. If two mentions start at the same word, - their order is defined by the last word in their span - -- the shorter mention precedes the longer one. + their order is defined by their length (i.e. number of words) + -- the shorter mention follows the longer one. + + In the rare case of two same-length mentions starting at the same word, but having different spans, + their order is defined by the order of the last word in their span. + For example precedes . + + The order of two same-span mentions is currently defined by their cluster_id. + There should be no same-span (or same-subspan) same-cluster mentions. 
""" - node1 = self._words[0] if self._words else self._head - node2 = other._words[0] if other._words else other._head - if node1 is node2: - node1 = self._words[-1] if self._words else self._head - node2 = other._words[-1] if other._words else other._head - if node1 is node2: - return len(self._words) < len(other._words) - return node1.precedes(node2) + #TODO: no mention.words should be handled already when loading + if not self._words: + self._words = [self._head] + if not another._words: + another._words = [another._head] + + if self._words[0] is another._words[0]: + if len(self._words) > len(another._words): + return True + if len(self._words) < len(another._words): + return False + if self._words[-1].precedes(another._words[-1]): + return True + if another._words[-1].precedes(self._words[-1]): + return False + return self._cluster.cluster_id < another._cluster.cluster_id + return self._words[0].precedes(another._words[0]) + + @property + def other(self): + if self._other is None: + self._other = OtherDualDict() + return self._other + + @other.setter + def other(self, value): + if self._other is None: + self._other = OtherDualDict(value) + else: + self._other.set_mapping(value) @property def head(self): @@ -74,7 +204,7 @@ def words(self): @words.setter def words(self, new_words): if new_words and self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words {new_words}") + raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._cluster.cluster_id}") kept_words = [] for old_word in self._words: if old_word in new_words: @@ -100,19 +230,24 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +CHARS_FORBIDDEN_IN_ID = "-=| \t()" + + @functools.total_ordering class CorefCluster(object): """Class for representing all mentions of a given entity.""" __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] def __init__(self, cluster_id, cluster_type=None): + if any(x in cluster_id for x 
in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") self._cluster_id = cluster_id self._mentions = [] self.cluster_type = cluster_type self.split_ante = [] - def __lt__(self, other): - """Does this CorefCluster precedes (word-order wise) the `other` cluster? + def __lt__(self, another): + """Does this CorefCluster precedes (word-order wise) `another` cluster? This method defines a total ordering of all clusters by the first mention of each cluster (see `CorefMention.__lt__`). @@ -121,18 +256,24 @@ def __lt__(self, other): If cluster IDs are not important, it is recommended to use block `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering. """ - if not self._mentions or not other._mentions: + if not self._mentions or not another._mentions: # Clusters without mentions should go first, so the ordering is total. # If both clusters are missing mentions, let's use cluster_id, so the ordering is stable. 
- if not self._mentions and not other._mentions: - return self._cluster_id < other._cluster_id + if not self._mentions and not another._mentions: + return self._cluster_id < another._cluster_id return not self._mentions - return self._mentions[0] < other._mentions[0] + return self._mentions[0] < another._mentions[0] @property def cluster_id(self): return self._cluster_id + @cluster_id.setter + def cluster_id(self, new_cluster_id): + if any(x in new_cluster_id for x in "-=| \t"): + raise ValueError(f"{new_cluster_id} contains forbidden characters [-=| \\t]") + self._cluster_id = new_cluster_id + @property def mentions(self): return self._mentions @@ -181,7 +322,26 @@ def all_bridging(self): yield b -BridgingLink = collections.namedtuple('BridgingLink', 'target relation') +# BridgingLink +# Especially the relation should be mutable, so we cannot use +# BridgingLink = collections.namedtuple('BridgingLink', 'target relation') +# TODO once dropping support for Python 3.6, we could use +# from dataclasses import dataclass +# @dataclass +# class DataClassCard: +# target: CorefCluster +# relation: str +class BridgingLink: + __slots__ = ['target', 'relation'] + + def __init__(self, target, relation=''): + self.target = target + self.relation = '' if relation is None else relation + + def __lt__(self, another): + if self.target == another.target: + return self.relation < another.relation + return self.target < another.target class BridgingLinks(collections.abc.MutableSequence): @@ -189,33 +349,52 @@ class BridgingLinks(collections.abc.MutableSequence): Example usage: >>> bl = BridgingLinks(src_mention) # empty links - >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples - >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset', clusters) # from a string + >>> bl = BridgingLinks(src_mention, [(c12, 'part'), (c56, 'subset')]) # from a list of tuples + >>> (bl8, bl9) = BridgingLinks.from_string('c12>> for cluster, relation in bl: >>> 
print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") - >>> print(str(bl)) # c12:Part,c56:Subset - >>> bl('Part').targets == [c12] - >>> bl('Part|Subset').targets == [c12, c56] - >>> bl.append((c89, 'Funct')) + >>> print(str(bl)) # c12>> bl('part').targets == [c12] + >>> bl('part|subset').targets == [c12, c56] + >>> bl.append((c57, 'funct')) """ - def __init__(self, src_mention, value=None, clusters=None, strict=True): + + @classmethod + def from_string(cls, string, clusters, strict=True): + src_str2bl = {} + for link_str in string.split(','): + try: + trg_str, src_str = link_str.split('<') + except ValueError as err: + _error(f"invalid Bridge {link_str} {err} at {node}", strict) + continue + relation = '' + if ':' in src_str: + src_str, relation = src_str.split(':', 1) + if trg_str == src_str: + _error("Bridge cannot self-reference the same cluster: " + trg_str, strict) + bl = src_str2bl.get(src_str) + if not bl: + bl = clusters[src_str].mentions[-1].bridging + src_str2bl[src_str] = bl + if trg_str not in clusters: + clusters[trg_str] = CorefCluster(trg_str) + bl._data.append(BridgingLink(clusters[trg_str], relation)) + return src_str2bl.values() + + def __init__(self, src_mention, value=None, strict=True): self.src_mention = src_mention self._data = [] self.strict = strict if value is not None: - if isinstance(value, str): - if clusters is None: - raise ValueError('BridgingClusters: clusters must be provided if initializing with a string') - try: - self._from_string(value, clusters) - except Exception: - logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n") - raise - elif isinstance(value, collections.abc.Sequence): + if isinstance(value, collections.abc.Sequence): for v in value: if v[0] is src_mention._cluster: _error("Bridging cannot self-reference the same cluster: " + v[0].cluster_id, strict) self._data.append(BridgingLink(v[0], v[1])) + else: + raise ValueError(f"Unknown value type: {type(value)}") + self.src_mention._bridging 
= self super().__init__() def __getitem__(self, key): @@ -239,18 +418,8 @@ def insert(self, key, new_value): self._data.insert(key, BridgingLink(new_value[0], new_value[1])) def __str__(self): - return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in sorted(self._data)) - - def _from_string(self, string, clusters): - self._data.clear() - for link_str in string.split(','): - target, relation = link_str.split(':') - if target == self.src_mention._cluster._cluster_id: - _error("Bridging cannot self-reference the same cluster: " + target, self.strict) - if target not in clusters: - clusters[target] = CorefCluster(target) - self._data.append(BridgingLink(clusters[target], relation)) - self._data.sort() + # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. + return ','.join(f'{l.target._cluster_id}<{self.src_mention.cluster.cluster_id}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. 
@@ -259,7 +428,7 @@ def __call__(self, relations_re=None): """ if relations_re is None: return self - return Links(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) + return BridgingLinks(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) @property def targets(self): @@ -286,112 +455,332 @@ def _error(msg, strict): raise ValueError(msg) logging.error(msg) + +RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]') + def load_coref_from_misc(doc, strict=True): clusters = {} + unfinished_mentions = collections.defaultdict(list) + discontinuous_mentions = collections.defaultdict(list) + global_entity = doc.meta.get('global.Entity') + was_global_entity = True + if not global_entity: + was_global_entity = False + global_entity = 'eid-etype-head-other' + doc.meta['global.Entity'] = global_entity + # backward compatibility + if global_entity == 'entity-GRP-infstat-MIN-coref_type-identity': + global_entity = 'etype-eid-infstat-minspan-link-identity' + # Which global.Entity should be used for serialization? 
+ doc.meta['global.Entity'] = global_entity + #doc.meta['global.Entity'] = 'eid-etype-head-other' + if 'eid' not in global_entity: + raise ValueError("No eid in global.Entity = " + global_entity) + fields = global_entity.split('-') + for node in doc.nodes_and_empty: - index, index_str = 0, "" - cluster_id = node.misc["ClusterId"] - if not cluster_id: - index, index_str = 1, "[1]" - cluster_id = node.misc["ClusterId[1]"] - while cluster_id: - cluster = clusters.get(cluster_id) - if cluster is None: - cluster = CorefCluster(cluster_id) - clusters[cluster_id] = cluster - mention = CorefMention(node, cluster) - if node.misc["MentionSpan" + index_str]: - mention.span = node.misc["MentionSpan" + index_str] + misc_entity = node.misc["Entity"] + if not misc_entity: + continue + + if not was_global_entity: + raise ValueError(f"No global.Entity header found, but Entity= annotations are presents") + + # The Entity attribute may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # chunks = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + chunks = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for chunk in chunks: + opening, closing = (chunk[0] == '(', chunk[-1] == ')') + chunk = chunk.strip('()') + # 1. invalid + if not opening and not closing: + logging.warning(f"Entity {chunk} at {node} has no opening nor closing bracket.") + # 2. 
closing bracket + elif not opening and closing: + # closing brackets should include just the ID, + # but older GUM versions repeated all the fields + if '-' in chunk: + # TODO delete this legacy hack once we don't need to load UD GUM v2.8 anymore + if not strict and global_entity.startswith('etype-eid'): + chunk = chunk.split('-')[1] + else: + _error("Unexpected closing eid " + chunk, strict) + + # closing discontinuous mentions + eid, subspan_idx = chunk, None + if chunk not in unfinished_mentions: + m = RE_DISCONTINUOUS.match(chunk) + if not m: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") + eid, subspan_idx, total_subspans = m.group(1, 2, 3) + + mention, head_idx = unfinished_mentions[eid].pop() + last_word = mention.words[-1] + if node.root is not last_word.root: + # TODO cross-sentence mentions + raise ValueError(f"Cross-sentence mentions not supported yet: {chunk} at {node}") + for w in node.root.descendants_and_empty: + if last_word.precedes(w): + mention._words.append(w) + w._mentions.append(mention) + if w is node: + break + if head_idx and (subspan_idx is None or subspan_idx == total_subspans): + try: + mention.head = mention.words[head_idx - 1] + except IndexError as err: + _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} " + f"closed at {node} with words={mention.words}", 1) + if subspan_idx and subspan_idx == total_subspans: + m = discontinuous_mentions[eid].pop() + if m is not mention: + _error(f"Closing mention {mention.cluster.cluster_id} at {node}, but it has unfinished nested mentions ({m.words})", 1) + + # 3. 
opening or single-word else: - mention.words = [node] - cluster_type = node.misc["ClusterType" + index_str] - if cluster_type is not None: - if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: - logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") - cluster.cluster_type = cluster_type - - bridging_str = node.misc["Bridging" + index_str] - if bridging_str: - mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict) - - split_ante_str = node.misc["SplitAnte" + index_str] - if split_ante_str: - split_antes = [] - # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. - # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. - for ante_str in split_ante_str.replace('+', ',').split(','): - if ante_str in clusters: - if ante_str == cluster_id: - _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) - split_antes.append(clusters[ante_str]) + eid, etype, head_idx, other = None, None, None, OtherDualDict() + for name, value in zip(fields, chunk.split('-')): + if name == 'eid': + eid = value + elif name == 'etype': + etype = value + elif name == 'head': + try: + head_idx = int(value) + except ValueError as err: + raise ValueError(f"Non-integer {value} as head index in {chunk} in {node}: {err}") + elif name == 'other': + if other: + new_other = OtherDualDict(value) + for k,v in other.values(): + new_other[k] = v + other = new_other + else: + other = OtherDualDict(value) else: - # split cataphora, e.g. "We, that is you and me..." 
- ante_cl = CorefCluster(ante_str) - clusters[ante_str] = ante_cl - split_antes.append(ante_cl) - cluster.split_ante = sorted(split_antes) - - mention.misc = node.misc["MentionMisc" + index_str] - index += 1 - index_str = f"[{index}]" - cluster_id = node.misc["ClusterId" + index_str] + other[name] = value + if eid is None: + raise ValueError("No eid in " + chunk) + subspan_idx, total_subspans = None, '0' + if eid[-1] == ']': + m = RE_DISCONTINUOUS.match(eid) + if not m: + _error(f"eid={eid} ending with ], but not valid discontinuous mention ID ", strict) + else: + eid, subspan_idx, total_subspans = m.group(1, 2, 3) + + cluster = clusters.get(eid) + if cluster is None: + if subspan_idx and subspan_idx != '1': + _error(f'Non-first subspan of a discontinuous mention {eid} at {node} does not have any previous mention.', 1) + cluster = CorefCluster(eid) + clusters[eid] = cluster + cluster.cluster_type = etype + elif etype and cluster.cluster_type and cluster.cluster_type != etype: + logging.warning(f"etype mismatch in {node}: {cluster.cluster_type} != {etype}") + # CorefCluster could be created first with "Bridge=" without any type + elif etype and cluster.cluster_type is None: + cluster.cluster_type = etype + + if subspan_idx and subspan_idx != '1': + opened = [pair[0] for pair in unfinished_mentions[eid]] + mention = next(m for m in discontinuous_mentions[eid] if m not in opened) + mention._words.append(node) + if closing and subspan_idx == total_subspans: + m = discontinuous_mentions[eid].pop() + if m is not mention: + _error(f"{node}: closing mention {mention.cluster.cluster_id} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) + try: + mention.head = mention._words[head_idx - 1] + except IndexError as err: + _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} " + f"closed at {node} with words={mention._words}", 1) + else: + mention = CorefMention(words=[node], cluster=cluster) + if other: + mention._other = other + if 
subspan_idx: + discontinuous_mentions[eid].append(mention) + node._mentions.append(mention) + + if not closing: + unfinished_mentions[eid].append((mention, head_idx)) + + + # Bridge, e.g. Entity=(e12-event|Bridge=e12 (e10) + # (e1(e2 --> (e1(e2(e10) + # e3)(e1(e2 --> e3)(e1(e2(e10) + if not orig_entity or orig_entity[-1] != ')': + firstword.misc['Entity'] += mention_str + ')' + # e4)e3) --> (e10)e4)e3) + elif '(' not in orig_entity: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # (e9)e4)e3) --> (e10)(e9)e4)e3) + elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split('(\([^()]+\)?|[^()]+\))', orig_entity)): + firstword.misc['Entity'] += mention_str + ')' + # (e1(e2(e9) --> (e1(e2(e9)(e10) + # e3)(e1(e2(e9)--> e3)(e1(e2(e9)(e10) + else: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # Second, multi-word mentions. Opening brackets should follow closing brackets. + else: + firstword.misc['Entity'] += mention_str + mention.words[-1].misc['Entity'] = cluster.cluster_id + ')' + mention.words[-1].misc['Entity'] + + # Bridge=e1 lo else f"{lo}") return ','.join(ranges) + + +# TODO fix code duplication with udapi.core.dualdict after making sure benchmarks are not slower +class OtherDualDict(collections.abc.MutableMapping): + """OtherDualDict class serves as dict with lazily synchronized string representation. + + >>> ddict = OtherDualDict('anacata:anaphoric,antetype:entity,nptype:np') + >>> ddict['mention'] = 'np' + >>> str(ddict) + 'anacata:anaphoric,antetype:entity,mention:np,nptype:np' + >>> ddict['NonExistent'] + '' + + This class provides access to both + * a structured (dict-based, deserialized) representation, + e.g. {'anacata': 'anaphoric', 'antetype': 'entity'}, and + * a string (serialized) representation of the mapping, e.g. `anacata:anaphoric,antetype:entity`. + There is a clever mechanism that makes sure that users can read and write + both of the representations which are always kept synchronized. 
+ Moreover, the synchronization is lazy, so the serialization and deserialization + is done only when needed. This speeds up scenarios where access to dict is not needed. + + A value can be deleted with any of the following three ways: + >>> del ddict['nptype'] + >>> ddict['nptype'] = None + >>> ddict['nptype'] = '' + and it works even if the value was already missing. + """ + __slots__ = ['_string', '_dict'] + + def __init__(self, value=None, **kwargs): + if value is not None and kwargs: + raise ValueError('If value is specified, no other kwarg is allowed ' + str(kwargs)) + self._dict = dict(**kwargs) + self._string = None + if value is not None: + self.set_mapping(value) + + def __str__(self): + if self._string is None: + serialized = [] + for name, value in sorted(self._dict.items(), key=lambda s: s[0].lower()): + if value is True: + serialized.append(name) + else: + serialized.append(f"{name}:{value}") + self._string = ','.join(serialized) if serialized else '' + return self._string + + def _deserialize_if_empty(self): + if not self._dict and self._string is not None and self._string != '': + for raw_feature in self._string.split(','): + namevalue = raw_feature.split(':', 1) + if len(namevalue) == 2: + name, value = namevalue + else: + name, value = namevalue[0], True + self._dict[name] = value + + def __getitem__(self, key): + self._deserialize_if_empty() + return self._dict.get(key, '') + + def __setitem__(self, key, value): + self._deserialize_if_empty() + self._string = None + if value is None or value == '': + self.__delitem__(key) + else: + value = value.replace(',', '%2C') # TODO report a warning? Escape also '|' and '-'? 
+ self._dict[key] = value + + def __delitem__(self, key): + self._deserialize_if_empty() + try: + del self._dict[key] + self._string = None + except KeyError: + pass + + def __iter__(self): + self._deserialize_if_empty() + return self._dict.__iter__() + + def __len__(self): + self._deserialize_if_empty() + return len(self._dict) + + def __contains__(self, key): + self._deserialize_if_empty() + return self._dict.__contains__(key) + + def clear(self): + self._string = '_' + self._dict.clear() + + def copy(self): + """Return a deep copy of this instance.""" + return copy.deepcopy(self) + + def set_mapping(self, value): + """Set the mapping from a dict or string. + + If the `value` is None, it is converted to storing an empty string. + If the `value` is a string, it is stored as is. + If the `value` is a dict (or any instance of `collections.abc.Mapping`), + its copy is stored. + Other types of `value` raise an `ValueError` exception. + """ + if value is None: + self.clear() + elif isinstance(value, str): + self._dict.clear() + self._string = value + elif isinstance(value, collections.abc.Mapping): + self._string = None + self._dict = dict(value) + else: + raise ValueError("Unsupported value type " + str(value)) diff --git a/udapi/core/document.py b/udapi/core/document.py index 8f9ce3ea..aceeafdf 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -123,3 +123,13 @@ def coref_clusters(self): """A dict mapping ClusterId to a CorefCluster object.""" self._load_coref() return self._coref_clusters + + @property + def coref_mentions(self): + """A sorted list of all CorefMention objects in the document.""" + self._load_coref() + all_mentions = [] + for cluster in self._coref_clusters.values(): + all_mentions.extend(cluster.mentions) + all_mentions.sort() + return all_mentions diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index a79c0610..540006ea 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -45,7 +45,7 @@ def __str__(self): 
if value is True: serialized.append(name) else: - serialized.append('%s=%s' % (name, value)) + serialized.append(f"{name}:{value}") self._string = '|'.join(serialized) if serialized else '_' return self._string diff --git a/udapi/core/root.py b/udapi/core/root.py index 3fbe5fca..0132566a 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -71,6 +71,10 @@ def address(self): """ return self.sent_id + @property + def document(self): + return self._bundle._document + @property def bundle(self): """Return the bundle which this tree belongs to.""" diff --git a/udapi/core/run.py b/udapi/core/run.py index 0a08504c..c3a4ca6f 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -101,7 +101,7 @@ def _import_blocks(block_names, block_args): command = "b%s(**kwargs)" % block_id logging.debug("Trying to evaluate this: %s", command) new_block_instance = eval(command) # pylint: disable=eval-used - blocks.append(new_block_instance) + blocks.append((block_name, new_block_instance)) return blocks @@ -133,11 +133,11 @@ def execute(self): blocks = _import_blocks(block_names, block_args) # Initialize blocks (process_start). - for block in blocks: + for bname, block in blocks: block.process_start() readers = [] - for block in blocks: + for bname, block in blocks: try: block.finished # pylint: disable=pointless-statement readers.append(block) @@ -147,15 +147,15 @@ def execute(self): logging.info('No reader specified, using read.Conllu') conllu_reader = Conllu() readers = [conllu_reader] - blocks = readers + blocks + blocks = [('read.Conllu', conllu_reader)] + blocks # Apply blocks on the data. finished = False while not finished: document = Document() logging.info(" ---- ROUND ----") - for block in blocks: - logging.info("Executing block " + block.__class__.__name__) + for bname, block in blocks: + logging.info(f"Executing block {bname}") block.apply_on_document(document) finished = True @@ -164,7 +164,7 @@ def execute(self): finished = finished and reader.finished # 6. 
close blocks (process_end) - for block in blocks: + for bname, block in blocks: block.process_end() # TODO: better implementation, included Scen From 276529ca0daa4dbbfd8e74f87592028b8ecb88bc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 11:00:18 +0100 Subject: [PATCH 093/871] oops, partial revert of the last commit --- udapi/core/dualdict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index 540006ea..ba0129ed 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -45,7 +45,7 @@ def __str__(self): if value is True: serialized.append(name) else: - serialized.append(f"{name}:{value}") + serialized.append(f"{name}={value}") self._string = '|'.join(serialized) if serialized else '_' return self._string From 81a65bf8a192aaf2bfdb9fa233b71b5a1c268d8f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 12:24:15 +0100 Subject: [PATCH 094/871] corefud.MarkNested in future, I would like to merge all the corefud.Mark* blocks into one universal block, but for now, let's archive this one --- udapi/block/corefud/marknested.py | 44 +++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 udapi/block/corefud/marknested.py diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py new file mode 100644 index 00000000..656111c6 --- /dev/null +++ b/udapi/block/corefud/marknested.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkNested(Block): + """Find nested mentions.""" + + def __init__(self, same_cluster_only=True, both_discontinuous=False, multiword_only=False, + print_form=False, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.both_discontinuous = both_discontinuous + self.multiword_only = multiword_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, 
mention): + if self.print_form: + return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.cluster.cluster_id + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + for mA, mB in itertools.combinations(mentions, 2): + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if not (sA <= sB) and not (sB <= sA): + continue + if self.multiword_only and (len(sA) == 1 or len(sB) == 1): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"nested mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") From b38e7e4312d373fc28995ca37ca7ce2d25363f8c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 14:54:07 +0100 Subject: [PATCH 095/871] CorefMention(words=[w1,w2]) should create backlinks from w1 and w2 The only case when we don't want these backlinks is "fake mentions" needed for serialization of discontinuous mentions, but that should be solved with a special parameter in `__init__`. 
Fixes #101 --- udapi/core/coref.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index aaaa07f0..3f54d9a9 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -109,16 +109,23 @@ class CorefMention(object): """Class for representing a mention (instance of an entity).""" __slots__ = ['_head', '_cluster', '_bridging', '_words', '_other'] - def __init__(self, words, head=None, cluster=None): + def __init__(self, words, head=None, cluster=None, add_word_backlinks=True): if not words: raise ValueError("mention.words must be non-empty") - self._words = words self._head = head if head else words[0] self._cluster = cluster if cluster is not None: cluster._mentions.append(self) self._bridging = None self._other = None + self._words = words + if add_word_backlinks: + for new_word in words: + if not new_word._mentions or not cluster or self > new_word._mentions[-1]: + new_word._mentions.append(self) + else: + new_word._mentions.append(self) + new_word._mentions.sort() def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? 
@@ -692,7 +699,7 @@ def store_coref_to_misc(doc): subspan_eid = f'{cluster.cluster_id}[{idx}/{len(subspans)}]' subspan_words = span_to_nodes(root, subspan) fake_cluster = CorefCluster(subspan_eid, cluster.cluster_type) - fake_mention = CorefMention(subspan_words, head_str, fake_cluster) + fake_mention = CorefMention(subspan_words, head_str, fake_cluster, add_word_backlinks=False) if mention._other: fake_mention._other = mention._other if mention._bridging and idx == 1: From fe4dfcf110dac83608af2d3f8ad944840a0aee1c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 01:30:46 +0100 Subject: [PATCH 096/871] corefud.IndexClusters will use prefix=e by default --- udapi/block/corefud/indexclusters.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index 1496c11c..14cf778d 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -3,7 +3,7 @@ class IndexClusters(Block): - """Re-index the coreference cluster IDs. The final cluster IDs are of the "c" form, + """Re-index the coreference cluster IDs. The final cluster IDs are of the "e" form, where are ordinal numbers starting from the one specified by the `start` parameter. This block can be applied on multiple documents within one udapy call. 
For example, to re-index ClusterId in all conllu files in the current directory @@ -13,11 +13,14 @@ class IndexClusters(Block): Parameters: ----------- start : int - the starting index (by default 1) + the starting index (default=1) + prefix : str + prefix of the IDs before the number (default="e") """ - def __init__(self, start=1): + def __init__(self, start=1, prefix='e'): self.start = start + self.prefix = prefix def process_document(self, doc): clusters = doc.coref_clusters @@ -26,7 +29,7 @@ def process_document(self, doc): new_clusters = {} for idx, cid in enumerate(clusters, self.start): cluster = clusters[cid] - new_cid = "c" + str(idx) + new_cid = self.prefix + str(idx) cluster.cluster_id = new_cid new_clusters[new_cid] = cluster self.start = idx + 1 From ecee32956bc33cdbf20944768f9f9b059340edd8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 10:15:37 +0100 Subject: [PATCH 097/871] prevent code duplication when setting cluster IDs --- udapi/core/coref.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3f54d9a9..022953a4 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -246,9 +246,8 @@ class CorefCluster(object): __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] def __init__(self, cluster_id, cluster_type=None): - if any(x in cluster_id for x in CHARS_FORBIDDEN_IN_ID): - raise ValueError(f"{cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") - self._cluster_id = cluster_id + self._cluster_id = None # prepare the _cluster_id slot + self.cluster_id = cluster_id # call the setter and check the ID is valid self._mentions = [] self.cluster_type = cluster_type self.split_ante = [] @@ -277,8 +276,8 @@ def cluster_id(self): @cluster_id.setter def cluster_id(self, new_cluster_id): - if any(x in new_cluster_id for x in "-=| \t"): - raise ValueError(f"{new_cluster_id} contains forbidden characters [-=| \\t]") + if any(x in 
new_cluster_id for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{new_cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") self._cluster_id = new_cluster_id @property From ee89d8fbd37dee421ebbbc1d64ecfd7bbc604eab Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 16:27:33 +0100 Subject: [PATCH 098/871] newpar_block (used in the newest GUM) should not be treated as newpar --- udapi/block/read/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 97e39970..d703fb26 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -12,7 +12,7 @@ # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released). RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') RE_TEXT = re.compile(r'^# text\s*=\s*(.*)') -RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?') +RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?$') RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)') From eab9a7890e37924d1a402eac0ddb99562f143f35 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 20:16:15 +0100 Subject: [PATCH 099/871] convert (doc-level) GRP to (corpus-level) eid transparently So GRP can be in global.Entity, both for reading and writing. 
--- udapi/core/coref.py | 100 +++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 24 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 022953a4..7b205d3c 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -280,6 +280,18 @@ def cluster_id(self, new_cluster_id): raise ValueError(f"{new_cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") self._cluster_id = new_cluster_id + @property + def eid_or_grp(self): + root = self._mentions[0].head.root + meta = root.document.meta + if 'GRP' in meta['global.Entity'] and meta['tree2docid']: + docid = meta['tree2docid'][root] + if self._cluster_id.startswith(docid): + return self._cluster_id.replace(docid, '', 1) + else: + logging.warning(f"GRP in global.Entity, but eid={self._cluster_id} does not start with docid={docid}") + return self._cluster_id + @property def mentions(self): return self._mentions @@ -366,7 +378,14 @@ class BridgingLinks(collections.abc.MutableSequence): """ @classmethod - def from_string(cls, string, clusters, strict=True): + def from_string(cls, string, clusters, node, strict=True, tree2docid=None): + """Return a sequence of BridgingLink objects representing a given string serialization. + The bridging links are also added to the mentions (`mention.bridging`) in the supplied `clusters`, + so the returned sequence can be usually ignored. + If `tree2docid` parameter is provided (mapping trees to document IDs used as prefixes in eid), + the entity IDs in the provided string are interpreted as "GRP", i.e. as document-wide IDs, + which need to be prefixed by the document IDs, to get corpus-wide unique "eid". 
+ """ src_str2bl = {} for link_str in string.split(','): try: @@ -378,7 +397,10 @@ def from_string(cls, string, clusters, strict=True): if ':' in src_str: src_str, relation = src_str.split(':', 1) if trg_str == src_str: - _error("Bridge cannot self-reference the same cluster: " + trg_str, strict) + _error(f"Bridge cannot self-reference the same cluster {trg_str} at {node}", strict) + if tree2docid: + src_str = tree2docid[node.root] + src_str + trg_str = tree2docid[node.root] + trg_str bl = src_str2bl.get(src_str) if not bl: bl = clusters[src_str].mentions[-1].bridging @@ -425,7 +447,7 @@ def insert(self, key, new_value): def __str__(self): # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. - return ','.join(f'{l.target._cluster_id}<{self.src_mention.cluster.cluster_id}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) + return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.cluster.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. @@ -463,8 +485,14 @@ def _error(msg, strict): RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]') +# When converting doc-level GRP IDs to corpus-level eid IDs, +# we need to assign each document a short ID/number (document names are too long). +# These document numbers must be unique even when loading multiple files, +# so we need to store the highest number generated so far here, at the Python module level. 
+highest_doc_n = 0 def load_coref_from_misc(doc, strict=True): + global highest_doc_n clusters = {} unfinished_mentions = collections.defaultdict(list) discontinuous_mentions = collections.defaultdict(list) @@ -474,13 +502,17 @@ def load_coref_from_misc(doc, strict=True): was_global_entity = False global_entity = 'eid-etype-head-other' doc.meta['global.Entity'] = global_entity - # backward compatibility - if global_entity == 'entity-GRP-infstat-MIN-coref_type-identity': - global_entity = 'etype-eid-infstat-minspan-link-identity' - # Which global.Entity should be used for serialization? - doc.meta['global.Entity'] = global_entity - #doc.meta['global.Entity'] = 'eid-etype-head-other' - if 'eid' not in global_entity: + tree2docid = None + if 'GRP' in global_entity: + tree2docid, docid = {}, "" + for bundle in doc: + for tree in bundle: + if tree.newdoc or docid == "": + highest_doc_n += 1 + docid = f"d{highest_doc_n}." + tree2docid[tree] = docid + doc.meta['tree2docid'] = tree2docid + elif 'eid' not in global_entity: raise ValueError("No eid in global.Entity = " + global_entity) fields = global_entity.split('-') @@ -506,14 +538,15 @@ def load_coref_from_misc(doc, strict=True): logging.warning(f"Entity {chunk} at {node} has no opening nor closing bracket.") # 2. 
closing bracket elif not opening and closing: - # closing brackets should include just the ID, - # but older GUM versions repeated all the fields - if '-' in chunk: + # closing brackets should include just the ID, but GRP needs to be converted to eid + if tree2docid: # TODO delete this legacy hack once we don't need to load UD GUM v2.8 anymore - if not strict and global_entity.startswith('etype-eid'): - chunk = chunk.split('-')[1] - else: - _error("Unexpected closing eid " + chunk, strict) + if '-' in chunk: + if not strict and global_entity.startswith('entity-GRP'): + chunk = chunk.split('-')[1] + else: + _error("Unexpected closing eid " + chunk, strict) + chunk = tree2docid[node.root] + chunk # closing discontinuous mentions eid, subspan_idx = chunk, None @@ -551,7 +584,9 @@ def load_coref_from_misc(doc, strict=True): for name, value in zip(fields, chunk.split('-')): if name == 'eid': eid = value - elif name == 'etype': + elif name == 'GRP': + eid = tree2docid[node.root] + value + elif name == 'etype' or name == 'entity': # entity is an old name for etype used in UD GUM 2.8 and 2.9 etype = value elif name == 'head': try: @@ -617,10 +652,10 @@ def load_coref_from_misc(doc, strict=True): # Bridge, e.g. Entity=(e12-event|Bridge=e12 Date: Tue, 15 Feb 2022 13:40:09 +0100 Subject: [PATCH 100/871] make sure mention.words are sorted even when reordering/deleting nodes --- udapi/core/coref.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 7b205d3c..cb865a31 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -206,6 +206,11 @@ def bridging(self): @property def words(self): + # Words in a sentence could have been reordered, so we cannot rely on sorting self._words in the setter. + # The serialization relies on storing the opening bracket in the first word (and closing in the last), + # so we need to make sure the words are always returned sorted. 
+ # TODO: benchmark updating the order of mention._words in node.shift_*() and node.remove(). + self._words.sort() return self._words @words.setter @@ -213,12 +218,13 @@ def words(self, new_words): if new_words and self.head not in new_words: raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._cluster.cluster_id}") kept_words = [] + # Make sure each word is included just once and they are in the correct order. + new_words = sorted(list(set(new_words))) for old_word in self._words: if old_word in new_words: kept_words.append(old_word) else: old_word._mentions.remove(self) - new_words.sort() self._words = new_words for new_word in new_words: if new_word not in kept_words: @@ -556,7 +562,10 @@ def load_coref_from_misc(doc, strict=True): raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") eid, subspan_idx, total_subspans = m.group(1, 2, 3) - mention, head_idx = unfinished_mentions[eid].pop() + try: + mention, head_idx = unfinished_mentions[eid].pop() + except IndexError as err: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") last_word = mention.words[-1] if node.root is not last_word.root: # TODO cross-sentence mentions From 7772c398998af5c25e89c12f11da26099d38d1bb Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 15 Feb 2022 21:57:47 +0100 Subject: [PATCH 101/871] Debugging corefud.MergeSameSpan. --- udapi/block/corefud/mergesamespan.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py index d5a46d25..802285af 100644 --- a/udapi/block/corefud/mergesamespan.py +++ b/udapi/block/corefud/mergesamespan.py @@ -24,6 +24,12 @@ def process_tree(self, tree): for mA, mB in itertools.combinations(mentions, 2): if self.same_cluster_only and mA.cluster != mB.cluster: continue + # Reduce non-determinism in which mention is removed: + # If the mentions belong to different entities, sort them by entity (cluster) ids. 
+ if mA.cluster.cluster_id > mB.cluster.cluster_id: + mX = mA + mA = mB + mB = mX sA, sB = set(mA.words), set(mB.words) if sA != sB: @@ -40,6 +46,7 @@ def process_tree(self, tree): # m.cluster = mA.cluster # Remove mention B. It may have been removed earlier because of # another duplicate, that is the purpose of try-except. + ###!!! TODO: If we remove a singleton, we are destroying the cluster. Then we must also handle possible bridging and split antecedents pointing to that cluster! for wb in sB: try: wb._mentions.remove(mB) From 90ac47a28aae22ba5ae946698b0ca5e1a0ba3e53 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 16 Feb 2022 02:59:03 +0100 Subject: [PATCH 102/871] don't store Bridge or SplitAnte links to already deleted clusters fixes #102 A better solution would be to delete links to deleted clusters immediately, but it is tricky to make this fast (we would need backlinks) and in some scenarios we need links to clusters without any mentions (e.g. when loading a file with cataphora SplitAnte/Bridge). 
--- udapi/core/coref.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index cb865a31..9e3e97ee 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -469,6 +469,13 @@ def targets(self): """Return a list of the target clusters (without relations).""" return [link.target for link in self._data] + def _delete_targets_without_mentions(self, warn=True): + for link in self._data: + if not link.target.mentions: + if warn: + logging.warning(f"Cluster {link.target.cluster_id} has no mentions, but is referred to in bridging of {self.src_mention.cluster.cluster_id}") + self._data.remove(link) + def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): clusters = head.root.bundle.document.coref_clusters @@ -832,6 +839,7 @@ def store_coref_to_misc(doc): # Bridge=e1 Date: Wed, 16 Feb 2022 03:02:11 +0100 Subject: [PATCH 103/871] `mention.words = []` takes care about deleting backlinks from words --- udapi/block/corefud/mergesamespan.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py index 802285af..bdeefd7c 100644 --- a/udapi/block/corefud/mergesamespan.py +++ b/udapi/block/corefud/mergesamespan.py @@ -27,9 +27,7 @@ def process_tree(self, tree): # Reduce non-determinism in which mention is removed: # If the mentions belong to different entities, sort them by entity (cluster) ids. if mA.cluster.cluster_id > mB.cluster.cluster_id: - mX = mA - mA = mB - mB = mX + mA, mB = mB, mA sA, sB = set(mA.words), set(mB.words) if sA != sB: @@ -40,18 +38,14 @@ def process_tree(self, tree): # mentions from the other cluster to this cluster, and remove the # other cluster. if mA.cluster != mB.cluster: - logging.warning("Merging same-span mentions that belong to different entities: '%s' vs. '%s'." 
% (mA.cluster.cluster_id, mB.cluster.cluster_id)) + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.cluster.cluster_id} vs. {mB.cluster.cluster_id}") ###!!! TODO: As of now, changing the cluster of a mention is not supported in the API. #for m in mB.cluster.mentions: # m.cluster = mA.cluster # Remove mention B. It may have been removed earlier because of # another duplicate, that is the purpose of try-except. ###!!! TODO: If we remove a singleton, we are destroying the cluster. Then we must also handle possible bridging and split antecedents pointing to that cluster! - for wb in sB: - try: - wb._mentions.remove(mB) - except ValueError: - pass + mB.words = [] try: mB.cluster.mentions.remove(mB) except ValueError: From f7e82090b443c7fd59557255e30e6fd5a6fc30e7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 22 Feb 2022 16:10:25 +0100 Subject: [PATCH 104/871] Russian block cloned from Czech. --- udapi/block/ud/ru/fixedeprels.py | 113 +++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 udapi/block/ud/ru/fixedeprels.py diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py new file mode 100644 index 00000000..54076b68 --- /dev/null +++ b/udapi/block/ud/ru/fixedeprels.py @@ -0,0 +1,113 @@ +"""Block to fix case-enhanced dependency relations in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
+ unambiguous = { + 'как': 'как' # remove morphological case + } + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Russian basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. 
+ prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + if re.match(r'^(acl|advcl):', edep['deprel']): + edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) + elif re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. + edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. 
The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + else: + # If one of the following expressions occurs followed by another preposition, + # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. 
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From c673bd5e90bc64f0c4f3db68e2e424ddcdb94567 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 22 Feb 2022 16:20:37 +0100 Subject: [PATCH 105/871] Removed some Czech-specific rules. 
--- udapi/block/ud/ru/fixedeprels.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 54076b68..48febee5 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -49,39 +49,16 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) elif re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. edep['deprel'] = 'nmod' - elif edep['deprel'] == 'obl:loc': - # Annotation error. The first occurrence in PDT dev: - # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' - # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. - # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. - edep['deprel'] = 'obl:v:loc' - elif edep['deprel'] == 'obl:arg:loc': - # Annotation error. 
The first occurrence in PDT dev: - edep['deprel'] = 'obl:arg:na:loc' elif edep['deprel'] == 'nmod:loc': - # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': - # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? - # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. - edep['deprel'] = 'obl' elif edep['deprel'] == 'nmod:voc': - # 'v 8. čísle tiskoviny Ty rudá krávo' edep['deprel'] = 'nmod:nom' - elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' else: # If one of the following expressions occurs followed by another preposition, # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. From ff0d5c915064a5b07977fe15cdc1858bf49fe786 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 22 Feb 2022 17:35:09 +0100 Subject: [PATCH 106/871] Circleci project setup (#103) * Add .circleci/config.yml * CircleCI debugging * CircleCI debug * regexes need \r"" * allow len(document) Users may expect this to work, when document[i] works. 
* reader.read_documents() * add a comment explaining the hack from #96 * add a first test for coreference API * fix the bug revealed in test_coref.py thanks to @ondfa * switch from TravisCI to CircleCI --- .circleci/config.yml | 57 ++++++++++++++++++ .travis.yml | 34 ----------- README.md | 2 +- test-requirements.txt | 1 + udapi/core/basereader.py | 20 +++++++ udapi/core/coref.py | 10 ++-- udapi/core/document.py | 3 + udapi/core/tests/__init__.py | 0 .../tests/data/fr-democrat-dev-sample.conllu | 60 +++++++++++++++++++ udapi/core/tests/test_coref.py | 23 +++++++ 10 files changed, 170 insertions(+), 40 deletions(-) create mode 100644 .circleci/config.yml delete mode 100644 .travis.yml create mode 100644 test-requirements.txt create mode 100644 udapi/core/tests/__init__.py create mode 100644 udapi/core/tests/data/fr-democrat-dev-sample.conllu create mode 100755 udapi/core/tests/test_coref.py diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..7be539d2 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,57 @@ +# Use the latest 2.1 version of CircleCI pipeline process engine. +# See: https://circleci.com/docs/2.0/configuration-reference +version: 2.1 + +# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. +# See: https://circleci.com/docs/2.0/orb-intro/ +orbs: + # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files + # Orb commands and jobs help you with common scripting around a language/tool + # so you dont have to copy and paste it everywhere. + # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python + python: circleci/python@1.5.0 + +# Define a job to be invoked later in a workflow. 
+# See: https://circleci.com/docs/2.0/configuration-reference/#jobs +jobs: + build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + # These next lines defines a Docker executors: https://circleci.com/docs/2.0/executor-types/ + # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub + # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python + # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container + # Change the version below to your required version of python + docker: + - image: cimg/python:3.9 + # Checkout the code as the first step. This is a dedicated CircleCI step. + # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. + # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt. + # Then run your tests! + # CircleCI will report the results back to your VCS provider. + steps: + - checkout + - python/install-packages: + pkg-manager: pip + # app-dir: ~/project/package-directory/ # If you're requirements.txt isn't in the root directory. + # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. 
+ - run: + name: Install Udapi + command: pip install --use-feature=in-tree-build ".[test]" + - run: + name: Run pytest tests + # This assumes pytest is installed via the install-package step above + command: pytest + - run: + name: Color TextModeTrees + command: udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 + - run: + name: External tests + command: cd udapi/core/tests && ./external_tests.sh + + +# Invoke jobs via workflows +# See: https://circleci.com/docs/2.0/configuration-reference/#workflows +workflows: + sample: # This is the name of the workflow, feel free to change it to better match your workflow. + # Inside the workflow, you define the jobs you want to run. + jobs: + - build-and-test diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 417e39fb..00000000 --- a/.travis.yml +++ /dev/null @@ -1,34 +0,0 @@ -language: python -python: - - "3.6" - - "3.7" - - "3.8" - - "3.9" -#before_install: -# - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test -# - sudo apt-get update -qq -# - sudo apt-get install -qq gcc-4.8 g++-4.8 -# - CC=g++-4.8 pip install ufal.udpipe -#install: -# - python setup.py install -install: - - pip3 install ".[test]" -script: - - python -m pytest - - udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 - - cd udapi/core/tests && ./external_tests.sh -jobs: - include: - - name: "Python 3.9 on Windows" - os: windows - language: shell - before_install: - - choco install python - - python --version - - python -m pip install --upgrade pip - - pip3 install --upgrade pytest - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - script: - - python -c 'import colorama;print("\033[31m some red text")' - - python -Xutf8 -c 'import udapi;udapi.Document("udapi/core/tests/data/babinsky.conllu").draw(color=1)' - - python -m pytest diff --git a/README.md b/README.md index 3bf52eec..11b689dc 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # udapi-python 
Python framework for processing Universal Dependencies data -[![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=svg)](https://circleci.com/gh/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..e079f8a6 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index fee9da4c..9210b910 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -28,6 +28,12 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. + # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. + # The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, + # which reads all the trees in a file at once, but it does not have access to the document instance, + # it just returns a sequence of trees (which may be split into multiple documents if `bundles_per_doc` is set). + # So `read.Conllu` cannot store the `global.Entity` in `document.meta['global.Entity']` where it belongs. 
self._global_entity = None @staticmethod @@ -170,6 +176,7 @@ def process_document(self, document): bundle.add_tree(root) if root.newdoc and root.newdoc is not True: document.meta["docname"] = root.newdoc + document.meta['global.Entity'] = self._global_entity filehandle = self.filehandle if filehandle is None: @@ -259,3 +266,16 @@ def process_document(self, document): if gc_was_enabled: gc.enable() gc.collect() + + def read_documents(self): + """Load all documents of this reader and return them as a list.""" + # udapi.core.document imports udapi.block.read.conllu because of doc.load_conllu(filename) + # and udapi.block.read.conllu loads this module (udapi.core.basereader), + # so we cannot load udapi.core.document at the beginning of this module. + from udapi.core.document import Document + docs = [] + while not self.finished: + doc = Document() + self.process_document(doc) + docs.append(doc) + return docs diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 9e3e97ee..9eedeeb6 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -542,7 +542,7 @@ def load_coref_from_misc(doc, strict=True): # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. # The following re.split line splits this into # chunks = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] - chunks = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', misc_entity) if x] for chunk in chunks: opening, closing = (chunk[0] == '(', chunk[-1] == ')') chunk = chunk.strip('()') @@ -752,7 +752,7 @@ def store_coref_to_misc(doc): for idx,subspan in enumerate(subspans, 1): eid = cluster.cluster_id if tree2docid and 'GRP' in fields: - eid = re.sub('^d\d+\.', '', eid) # TODO or "eid = cluster.eid_or_grp"? + eid = re.sub(r'^d\d+\.', '', eid) # TODO or "eid = cluster.eid_or_grp"? 
subspan_eid = f'{eid}[{idx}/{len(subspans)}]' subspan_words = span_to_nodes(root, subspan) fake_cluster = CorefCluster(subspan_eid, cluster.cluster_type) @@ -771,7 +771,7 @@ def store_coref_to_misc(doc): if field == 'eid' or field == 'GRP': eid = cluster.cluster_id if field == 'GRP': - eid = re.sub('^d\d+\.', '', eid) + eid = re.sub(r'^d\d+\.', '', eid) if any(x in eid for x in CHARS_FORBIDDEN_IN_ID): _error(f"{eid} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]", strict) for c in CHARS_FORBIDDEN_IN_ID: @@ -823,7 +823,7 @@ def store_coref_to_misc(doc): elif '(' not in orig_entity: firstword.misc['Entity'] = mention_str + ')' + orig_entity # (e9)e4)e3) --> (e10)(e9)e4)e3) - elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split('(\([^()]+\)?|[^()]+\))', orig_entity)): + elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split(r'(\([^()]+\)?|[^()]+\))', orig_entity)): firstword.misc['Entity'] += mention_str + ')' # (e1(e2(e9) --> (e1(e2(e9)(e10) # e3)(e1(e2(e9)--> e3)(e1(e2(e9)(e10) @@ -834,7 +834,7 @@ def store_coref_to_misc(doc): firstword.misc['Entity'] += mention_str eid = cluster.cluster_id if tree2docid and 'GRP' in fields: - eid = re.sub('^d\d+\.', '', eid) + eid = re.sub(r'^d\d+\.', '', eid) mention.words[-1].misc['Entity'] = eid + ')' + mention.words[-1].misc['Entity'] # Bridge=e1 Date: Fri, 25 Feb 2022 23:16:53 +0100 Subject: [PATCH 107/871] =?UTF-8?q?'=D1=81=5F=D0=BF=D0=BE=D0=BC=D0=BE?= =?UTF-8?q?=D1=89=D1=8C=D1=8E:gen'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 48febee5..675e9eac 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -11,7 +11,8 @@ class FixEdeprels(Block): # case. 
And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { - 'как': 'как' # remove morphological case + 'как': 'как', # remove morphological case + 'с_помощь': 'с_помощью:gen' } def process_node(self, node): From e3dbb514d734e96000d10e23a402a6aabd982e8d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 Mar 2022 11:13:50 +0100 Subject: [PATCH 108/871] CircleCI icon matching the style of other icons --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11b689dc..0b41297f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # udapi-python Python framework for processing Universal Dependencies data -[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=svg)](https://circleci.com/gh/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=shield)](https://circleci.com/gh/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) From d1db476c8dbd2d52a89c42da0f4175e03baf528b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 Mar 2022 15:14:46 +0100 Subject: [PATCH 109/871] fix `entity.create_mention()` and add a test --- udapi/core/coref.py | 2 +- udapi/core/tests/test_coref.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 9eedeeb6..35c66d83 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -330,7 +330,7 @@ def create_mention(self, head=None, mention_words=None, mention_span=None): if head is None: head = mention_words[0] - mention = CorefMention(head, self) + mention = CorefMention(words=[head], head=head, cluster=self) if mention_words: mention.words = mention_words if mention_span: diff --git 
a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 855a338e..6a77e886 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -18,6 +18,31 @@ def test_load(self): self.assertEqual(len(coref_entities), 1) self.assertEqual(coref_entities[0].cluster_id, 'e36781') + def test_edits(self): + data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') + doc = udapi.Document(data_filename) + first_node = next(doc.nodes) + second_node = first_node.next_node + new_entity = first_node.create_coref_cluster(cluster_type='person') + self.assertEqual(new_entity.cluster_type, 'person') + self.assertEqual(len(new_entity.mentions), 1) + m1 = new_entity.mentions[0] + self.assertEqual(m1.cluster, new_entity) + self.assertEqual(m1.head, first_node) + self.assertEqual(m1.words, [first_node]) + self.assertEqual(m1.span, '1') + m1.words = [second_node, first_node, first_node] # intentional duplicates and wrong order + self.assertEqual(m1.words, [first_node, second_node]) + self.assertEqual(m1.span, '1-2') + m1.head = second_node + self.assertEqual(m1.head, second_node) + m2 = new_entity.create_mention(head=second_node, mention_span='1-3') + self.assertEqual(len(new_entity.mentions), 2) + self.assertEqual(new_entity.mentions[0], m2) # 1-3 should go before 1-2 + self.assertEqual(new_entity.mentions[1], m1) + self.assertTrue(m2 < m1) + self.assertEqual(m2.words, [first_node, second_node, second_node.next_node]) + if __name__ == "__main__": unittest.main() From 1ffb06789175aa5bf9ee546df7ae2624bb42acde Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 Mar 2022 19:51:23 +0100 Subject: [PATCH 110/871] add `doc.create_coref_cluster()`, rename params of `cluster.create_mention()` `doc.create_coref_cluster()` allows to create a new cluster without any mentions, which seems to be practical in real use cases. 
`m = cluster.create_mention(mention_words=[w1, w2])` seems redunant, `m = cluster.create_mention(words=[w1, w2])` looks better. Similarly with `mention_span` -> `span`. It will be consistent with `m.words` and `m.span`. TODO: consider removing `new_cluster = node.create_coref_cluster()` which creates both a cluster and a new mention, but does not return the mention (it can be accessed with `new_cluster.mentions[0]`, of course). So far, I've just remove it from the tests. --- udapi/core/coref.py | 48 ++++++++++++---------------------- udapi/core/document.py | 13 +++++++++ udapi/core/node.py | 8 ++++-- udapi/core/tests/test_coref.py | 8 +++--- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 35c66d83..eef25dd2 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -302,7 +302,7 @@ def eid_or_grp(self): def mentions(self): return self._mentions - def create_mention(self, head=None, mention_words=None, mention_span=None): + def create_mention(self, head=None, words=None, span=None): """Create a new CoreferenceMention object within this CorefCluster. Args: @@ -310,31 +310,31 @@ def create_mention(self, head=None, mention_words=None, mention_span=None): The head is supposed to be the linguistic head of the mention, i.e. the highest node in the dependency tree, but if such information is not available (yet), - it can be any node within the mention_words. - If no head is specified, the first word from mention_words will be used instead. - mention_words: a list of nodes of the mention. + it can be any node within the `words`. + If no head is specified, the first word from `words` will be used instead. + words: a list of nodes of the mention. This argument is optional, but if provided, it must contain the head. The nodes can be both normal nodes or empty nodes. 
- mention_span: an alternative way how to specify mention_words + span: an alternative way how to specify `words` using a string such as "3-5,6,7.1-7.2". (which means, there is an empty node 5.1 and normal node 7, which are not part of the mention). - At most one of the args mention_words and mention_span can be specified. + At most one of the args `words` and `span` can be specified. """ - if mention_words and mention_span: - raise ValueError("Cannot specify both mention_words and mention_span") - if head and mention_words and head not in mention_words: - raise ValueError(f"Head {head} is not among the specified mention_words") - if head is None and mention_words is None: - raise ValueError("Either head or mention_words must be specified") + if words and span: + raise ValueError("Cannot specify both words and span") + if head and words and head not in words: + raise ValueError(f"Head {head} is not among the specified words") + if head is None and words is None: + raise ValueError("Either head or words must be specified") if head is None: - head = mention_words[0] + head = words[0] mention = CorefMention(words=[head], head=head, cluster=self) - if mention_words: - mention.words = mention_words - if mention_span: - mention.span = mention_span + if words: + mention.words = words + if span: + mention.span = span self._mentions.sort() return mention @@ -477,20 +477,6 @@ def _delete_targets_without_mentions(self, warn=True): self._data.remove(link) -def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): - clusters = head.root.bundle.document.coref_clusters - if not cluster_id: - counter = 1 - while clusters.get('c%d' % counter): - counter += 1 - cluster_id = 'c%d' % counter - elif clusters.get(cluster_id): - raise ValueError("Cluster with a id %s already exists", cluster_id) - cluster = CorefCluster(cluster_id, cluster_type) - cluster.create_mention(head, **kwargs) - clusters[cluster_id] = cluster - return cluster - def _error(msg, strict): if 
strict: raise ValueError(msg) diff --git a/udapi/core/document.py b/udapi/core/document.py index c50f8e43..8e33c8d6 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -136,3 +136,16 @@ def coref_mentions(self): all_mentions.extend(cluster.mentions) all_mentions.sort() return all_mentions + + def create_coref_cluster(self, cluster_id=None, cluster_type=None): + self._load_coref() + if not cluster_id: + counter = 1 + while self._coref_clusters.get(f'c{counter}'): + counter += 1 + cluster_id = f'c{counter}' + elif clusters.get(cluster_id): + raise ValueError("Cluster with a id %s already exists", cluster_id) + cluster = udapi.core.coref.CorefCluster(cluster_id, cluster_type) + self._coref_clusters[cluster_id] = cluster + return cluster diff --git a/udapi/core/node.py b/udapi/core/node.py index 3d120a52..4524c119 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -921,8 +921,12 @@ def coref_clusters(self): self._root.bundle.document._load_coref() return [m.cluster for m in self._mentions if m.cluster is not None] - def create_coref_cluster(self, **kwargs): - return udapi.core.coref.create_coref_cluster(head=self, **kwargs) + # TODO: is this method useful? 
+ def create_coref_cluster(self, cluster_id=None, cluster_type=None, **kwargs): + doc = self._root.bundle.document + cluster = doc.create_coref_cluster(cluster_id, cluster_type) + cluster.create_mention(head=self, **kwargs) + return cluster class CycleError(Exception): diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 6a77e886..8eab1436 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -23,10 +23,12 @@ def test_edits(self): doc = udapi.Document(data_filename) first_node = next(doc.nodes) second_node = first_node.next_node - new_entity = first_node.create_coref_cluster(cluster_type='person') + new_entity = doc.create_coref_cluster(cluster_type='person') self.assertEqual(new_entity.cluster_type, 'person') + self.assertEqual(len(new_entity.mentions), 0) + m1 = new_entity.create_mention(words=[first_node]) # head will be automatically set to words[0] self.assertEqual(len(new_entity.mentions), 1) - m1 = new_entity.mentions[0] + self.assertEqual(m1, new_entity.mentions[0]) self.assertEqual(m1.cluster, new_entity) self.assertEqual(m1.head, first_node) self.assertEqual(m1.words, [first_node]) @@ -36,7 +38,7 @@ def test_edits(self): self.assertEqual(m1.span, '1-2') m1.head = second_node self.assertEqual(m1.head, second_node) - m2 = new_entity.create_mention(head=second_node, mention_span='1-3') + m2 = new_entity.create_mention(head=second_node, span='1-3') # mention.words will be filled according to the span self.assertEqual(len(new_entity.mentions), 2) self.assertEqual(new_entity.mentions[0], m2) # 1-3 should go before 1-2 self.assertEqual(new_entity.mentions[1], m1) From d025550b1102fa0fa2546e9d0e8cc9de0165498a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 9 Mar 2022 09:48:16 +0100 Subject: [PATCH 111/871] Some more Russian prepositions. 
--- udapi/block/ud/ru/fixedeprels.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 675e9eac..440cfd0b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -11,8 +11,12 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'в_качество': 'в_качестве:gen', + 'в_течение': 'в_течение:gen', 'как': 'как', # remove morphological case - 'с_помощь': 'с_помощью:gen' + 'несмотря_на': 'несмотря_на:acc', + 'с_помощь': 'с_помощью:gen', + 'чем': 'чем' # remove morphological case } def process_node(self, node): @@ -38,7 +42,7 @@ def process_node(self, node): # available. Thanks to the Case feature on prepositions, we can # identify the correct one. if not solved: - m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see # some children because they may be shared children of coordination. @@ -46,6 +50,9 @@ def process_node(self, node): if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() solved = True + else: + # Accusative or locative are possible. Pick locative. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' if re.match(r'^(acl|advcl):', edep['deprel']): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) From c6893e678e637121b5413b5322e41eb6fca868e7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 10 Mar 2022 17:25:34 +0100 Subject: [PATCH 112/871] Russian prepositions with morphological case. --- udapi/block/ud/ru/fixedeprels.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 440cfd0b..e96fd8d1 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -42,7 +42,9 @@ def process_node(self, node): # available. Thanks to the Case feature on prepositions, we can # identify the correct one. if not solved: - m = re.match(r'^(obl(?::arg)?|nmod):(на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + ###!!! Both "на" and "в" seem to also occur with genitive. + ###!!! I don't think it is valid but let's see some examples before we ban it. + m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see # some children because they may be shared children of coordination. From 51cd1b134bac365139ca0841d01aebd22794b247 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 11 Mar 2022 11:06:32 +0100 Subject: [PATCH 113/871] Russian prepositions with morphological case. 
--- udapi/block/ud/ru/fixedeprels.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index e96fd8d1..d9c539d5 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -13,6 +13,7 @@ class FixEdeprels(Block): unambiguous = { 'в_качество': 'в_качестве:gen', 'в_течение': 'в_течение:gen', + 'в_ход': 'в_ходе:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'с_помощь': 'с_помощью:gen', @@ -42,9 +43,10 @@ def process_node(self, node): # available. Thanks to the Case feature on prepositions, we can # identify the correct one. if not solved: - ###!!! Both "на" and "в" seem to also occur with genitive. - ###!!! I don't think it is valid but let's see some examples before we ban it. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|dat|voc))?$', edep['deprel']) + # Both "на" and "в" also occur with genitive. However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see # some children because they may be shared children of coordination. From 71899a4bfddeafe870d4a18c403042c2f980c678 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 12 Mar 2022 13:42:27 +0100 Subject: [PATCH 114/871] Russian prepositions with morphological cases. --- udapi/block/ud/ru/fixedeprels.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index d9c539d5..cc21ec33 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,10 +12,13 @@ class FixEdeprels(Block): # case, even if they are not secondary. 
unambiguous = { 'в_качество': 'в_качестве:gen', + 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', + 'помимо': 'помимо:gen', + 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', 'чем': 'чем' # remove morphological case } From 094a2b982cc7575ab7b3c5d22d103b08e6343ece Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 13 Mar 2022 11:05:26 +0100 Subject: [PATCH 115/871] Russian prepositions. --- udapi/block/ud/ru/fixedeprels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index cc21ec33..eb8292ae 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -15,11 +15,13 @@ class FixEdeprels(Block): 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'до': 'до:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'помимо': 'помимо:gen', 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', + 'так_что': 'так_что', # remove morphological case 'чем': 'чем' # remove morphological case } From 3b6748be36653e917420d44a6c8d7d8a65782aa0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Mar 2022 15:04:17 +0100 Subject: [PATCH 116/871] =?UTF-8?q?Rusk=C3=A9=20p=C5=99edlo=C5=BEky.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index eb8292ae..c4906053 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,15 +12,19 @@ class FixEdeprels(Block): # case, even if they are not secondary. 
unambiguous = { 'в_качество': 'в_качестве:gen', + 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'ведь': 'ведь', # remove morphological case 'до': 'до:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'помимо': 'помимо:gen', + 'согласно': 'согласно:dat', 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', + 'спустя': 'спустя:acc', 'так_что': 'так_что', # remove morphological case 'чем': 'чем' # remove morphological case } From c513f676363cf0b901af67f1840f184f7361a9b7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Mar 2022 16:15:41 +0100 Subject: [PATCH 117/871] =?UTF-8?q?=D0=B7=D0=B0:gen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index c4906053..7a5a0dc1 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -66,6 +66,20 @@ def process_node(self, node): else: # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + # Both "за" and "" also occur with instrumental. However, this + # is only because there are numerals in the phrase ("за последние 20 лет") + # and the whole phrase should be usually analyzed as accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + else: + # Accusative or instrumental are possible. Pick accusative. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' if re.match(r'^(acl|advcl):', edep['deprel']): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) From 5f7d17f6db8263c062b5f8902c35e7a3baa7ce9f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 12:46:30 +0100 Subject: [PATCH 118/871] =?UTF-8?q?=D1=81=D0=BB=D0=BE=D0=B2=D0=BD=D0=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 82 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 7a5a0dc1..5aaf6308 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -21,9 +21,10 @@ class FixEdeprels(Block): 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'помимо': 'помимо:gen', - 'согласно': 'согласно:dat', - 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', + 'словно': 'словно', # remove morphological case + 'со_сторона': 'со_стороны:gen', + 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', 'так_что': 'так_что', # remove morphological case 'чем': 'чем' # remove morphological case @@ -48,38 +49,46 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True break + if solved: + break + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. 
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):словно([_:].+)?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':словно' + break # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. - if not solved: - # Both "на" and "в" also occur with genitive. However, this - # is only because there are numerals in the phrase ("в 9 случаев из 10") - # and the whole phrase should not be analyzed as genitive. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) - if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True - else: - # Accusative or locative are possible. Pick locative. - edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' - # Both "за" and "" also occur with instrumental. However, this - # is only because there are numerals in the phrase ("за последние 20 лет") - # and the whole phrase should be usually analyzed as accusative. - m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) - if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True - else: - # Accusative or instrumental are possible. Pick accusative. - edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + # Both "на" and "в" also occur with genitive. 
However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + else: + # Accusative or locative are possible. Pick locative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + # Both "за" and "" also occur with instrumental. However, this + # is only because there are numerals in the phrase ("за последние 20 лет") + # and the whole phrase should be usually analyzed as accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + else: + # Accusative or instrumental are possible. Pick accusative. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' if re.match(r'^(acl|advcl):', edep['deprel']): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) @@ -94,17 +103,6 @@ def process_node(self, node): edep['deprel'] = 'nmod:nom' elif edep['deprel'] == 'nmod:voc': edep['deprel'] = 'nmod:nom' - else: - # If one of the following expressions occurs followed by another preposition, - # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 0cc70536aff35dc62fa3a22a904b6a3016caa3ae Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 15:20:05 +0100 Subject: [PATCH 119/871] =?UTF-8?q?=D0=BF=D1=80=D0=B8=5F=D0=BF=D0=BE=D0=BC?= =?UTF-8?q?=D0=BE=D1=89=D0=B8:gen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 5aaf6308..74b919fb 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -20,7 +20,9 @@ class FixEdeprels(Block): 'до': 'до:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', + 'по_повод': 'по_поводу:gen', 'помимо': 'помимо:gen', + 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', 'словно': 'словно', # remove morphological case 'со_сторона': 'со_стороны:gen', From dc0cd7d432af92e86c476d44c46e510a07a5aae5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 16:40:00 +0100 Subject: [PATCH 120/871] More systematic processing of outermost case markers. --- udapi/block/ud/ru/fixedeprels.py | 39 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 74b919fb..c357fa49 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -5,31 +5,40 @@ class FixEdeprels(Block): + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'как' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('как_в:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + outermost = [ + 'ведь', + 'как', + 'словно', + 'так_что', + 'чем' + ] + # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. 
unambiguous = { + 'в_вид': 'в_виде:gen', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', - 'ведь': 'ведь', # remove morphological case 'до': 'до:gen', - 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'по_повод': 'по_поводу:gen', 'помимо': 'помимо:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', - 'словно': 'словно', # remove morphological case 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', - 'спустя': 'спустя:acc', - 'так_что': 'так_что', # remove morphological case - 'чем': 'чем' # remove morphological case + 'спустя': 'спустя:acc' } def process_node(self, node): @@ -43,6 +52,17 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. + for x in self.outermost: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + break for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -53,13 +73,6 @@ def process_node(self, node): break if solved: break - # If one of the following expressions occurs followed by another preposition - # or by morphological case, remove the additional case marking. For example, - # 'словно_у' becomes just 'словно'. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):словно([_:].+)?$', edep['deprel']) - if m: - edep['deprel'] = m.group(1)+':словно' - break # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. 
From b3e97c3b107e66d6f3c62549e3e000b8cfa2890e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 17:51:37 +0100 Subject: [PATCH 121/871] It is now possible to define exceptions to the rule. --- udapi/block/ud/ru/fixedeprels.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index c357fa49..6a1b001e 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -10,13 +10,14 @@ class FixEdeprels(Block): # is used with the same case (preposition + morphology) as the nominal that # is being compared ('как_в:loc' etc.) We do not want to multiply the relations # by all the inner cases. - outermost = [ - 'ведь', - 'как', - 'словно', - 'так_что', - 'чем' - ] + # The list in the value contains exceptions that should be left intact. + outermost = { + 'ведь': [], + 'как': ['как_только'], + 'словно': [], + 'так_что': [], + 'чем': [] + } # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that @@ -56,8 +57,9 @@ def process_node(self, node): # or by morphological case, remove the additional case marking. For example, # 'словно_у' becomes just 'словно'. for x in self.outermost: + exceptions = self.outermost[x] m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m: + if m and not x+m.group(2) in exceptions: edep['deprel'] = m.group(1)+':'+x solved = True break From 052ab6c68235cb15fcc4c7f69718bb03e290a7ab Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 17:54:37 +0100 Subject: [PATCH 122/871] Fix? 
--- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 6a1b001e..ab706346 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -59,7 +59,7 @@ def process_node(self, node): for x in self.outermost: exceptions = self.outermost[x] m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m and not x+m.group(2) in exceptions: + if m and m.group(2) and not x+m.group(2) in exceptions: edep['deprel'] = m.group(1)+':'+x solved = True break From 61845025b063ce51a96384a59b8caaf4c67fd26f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 18:16:25 +0100 Subject: [PATCH 123/871] =?UTF-8?q?=D0=BA:dat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index ab706346..43ad5f3a 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,6 +12,7 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { + 'будто': [], 'ведь': [], 'как': ['как_только'], 'словно': [], @@ -32,6 +33,7 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'до': 'до:gen', + 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', 'по_повод': 'по_поводу:gen', 'помимо': 'помимо:gen', @@ -106,11 +108,7 @@ def process_node(self, node): else: # Accusative or instrumental are possible. Pick accusative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' - if re.match(r'^(acl|advcl):', edep['deprel']): - edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - elif re.match(r'^(nmod|obl):', edep['deprel']): + if re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has From 9c003aa06b289a21db51d612e4298b2d64632773 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 18:56:34 +0100 Subject: [PATCH 124/871] Fix advcl:(od|do):gen. --- udapi/block/ud/cs/fixedeprels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index b3e551e5..feed707d 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -256,6 +256,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:do:gen$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! 
edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) @@ -264,6 +265,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:od:gen$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^advcl:podle:gen$', r'obl:podle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:pro:acc$', r'obl:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) From f66340342ee7443e85956d35e4302164fa154c9b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 20:57:56 +0100 Subject: [PATCH 125/871] Minor fixes in Czech. --- udapi/block/ud/cs/fixedeprels.py | 73 ++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index feed707d..871939a8 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -5,6 +5,27 @@ class FixEdeprels(Block): + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. 
+ outermost = { + 'ač': [], + 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'byť': [], + 'i_když': [], + 'jak': [], + 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole + 'jako': [], + 'jakoby': [], # these instances in FicTree should be spelled 'jako by' + 'než': [], + 'protože': [], + 'takže': [], + 'třebaže': [] + } + # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological @@ -230,6 +251,21 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # Removing 'až' must be done early. The remainder may be 'počátek' + # and we will want to convert it to 'počátkem:gen'. + edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + break for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -238,18 +274,19 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True break + if solved: + break # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. 
- if not solved: - m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) - if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) @@ -261,7 +298,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob:gen$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) @@ -427,26 +464,10 @@ def process_node(self, node): # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: - # If one of the following expressions occurs followed by another preposition, - # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. 
- edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) - # edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto From 5adadf475c66727f67b6bc866e7f9f18a8ef4984 
Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 21:41:29 +0100 Subject: [PATCH 126/871] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 5 ++--- udapi/block/ud/ru/fixedeprels.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 871939a8..019dd35b 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -265,7 +265,7 @@ def process_node(self, node): solved = True break if solved: - break + continue for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -275,7 +275,7 @@ def process_node(self, node): solved = True break if solved: - break + continue # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. @@ -319,7 +319,6 @@ def process_node(self, node): node.feats['Tense'] = '' node.feats['VerbForm'] = '' node.feats['Voice'] = '' - edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) elif re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 43ad5f3a..fba30571 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -66,7 +66,7 @@ def process_node(self, node): solved = True break if solved: - break + continue for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. 
@@ -76,7 +76,7 @@ def process_node(self, node): solved = True break if solved: - break + continue # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. From 75d5ea22d23fdf2cadb16991a0b44ee83e5fddc3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 22:03:40 +0100 Subject: [PATCH 127/871] advcl:k:dat --- udapi/block/ud/cs/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 019dd35b..53337763 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -295,7 +295,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:do:gen$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' From fea338fec4872aa320dadb29313bb8d9392f6477 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 22:31:41 +0100 Subject: [PATCH 128/871] Another attempt to fix all Czech edeprels. 
--- udapi/block/ud/cs/fixedeprels.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 53337763..5be99867 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -251,6 +251,22 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:od(?::gen)?$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:od(?::gen)?$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:podle(?::gen)?$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) # Removing 'až' must be done early. 
The remainder may be 'počátek' # and we will want to convert it to 'počátkem:gen'. edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) @@ -293,21 +309,9 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:do:gen$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! - edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:od:gen$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! 
- edep['deprel'] = re.sub(r'^advcl:podle:gen$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro:acc$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duchu:gen$', r'obl:v_duchu:gen', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': edep['deprel'] = 'nmod:v:loc' node.deprel = 'nmod' From f2c64f8083dc278ba338115625d6d422a954f1e8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 23:18:35 +0100 Subject: [PATCH 129/871] =?UTF-8?q?59=20obl:=D0=B2=5F=D1=81=D0=BE=D0=BE?= =?UTF-8?q?=D1=82=D0=B2=D0=B5=D1=82=D1=81=D1=82=D0=B2=D0=B8=D0=B5=5F=D1=81?= =?UTF-8?q?:ins=20=20=20=20=20=2058=20obl:=D1=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 46 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index fba30571..91046131 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -30,6 +30,7 @@ class FixEdeprels(Block): 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', + 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'до': 'до:gen', @@ -44,6 +45,19 @@ class FixEdeprels(Block): 'спустя': 'спустя:acc' } + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. 
+ prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + def process_node(self, node): """ Occasionally the edeprels automatically derived from the Russian basic @@ -85,29 +99,31 @@ def process_node(self, node): # and the whole phrase should not be analyzed as genitive. m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + adpcase = copy_case_from_adposition(self, node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase else: # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' - # Both "за" and "" also occur with instrumental. However, this - # is only because there are numerals in the phrase ("за последние 20 лет") - # and the whole phrase should be usually analyzed as accusative. + continue m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + adpcase = copy_case_from_adposition(self, node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase else: # Accusative or instrumental are possible. Pick accusative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = copy_case_from_adposition(self, node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick instrumental. + edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + continue if re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. From 8a998468cf7524e267d95b5e987a06477bc14436 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 23:21:28 +0100 Subject: [PATCH 130/871] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 91046131..96831746 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -99,7 +99,7 @@ def process_node(self, node): # and the whole phrase should not be analyzed as genitive. 
m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: - adpcase = copy_case_from_adposition(self, node, m.group(2)) + adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: edep['deprel'] = m.group(1)+':'+adpcase else: @@ -108,7 +108,7 @@ def process_node(self, node): continue m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: - adpcase = copy_case_from_adposition(self, node, m.group(2)) + adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: edep['deprel'] = m.group(1)+':'+adpcase else: @@ -117,7 +117,7 @@ def process_node(self, node): continue m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) if m: - adpcase = copy_case_from_adposition(self, node, m.group(2)) + adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: edep['deprel'] = m.group(1)+':'+adpcase else: From 1131de3720d176eac151afd4139ae822088542f7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Mar 2022 10:46:02 +0100 Subject: [PATCH 131/871] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 5be99867..57e2bfb0 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -19,7 +19,7 @@ class FixEdeprels(Block): 'jak': [], 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], - 'jakoby': [], # these instances in FicTree should be spelled 'jako by' + 'jakoby': ['jakoby_pod'], # these instances in FicTree should be spelled 'jako by' 'než': [], 'protože': [], 'takže': [], @@ -33,7 +33,7 @@ class FixEdeprels(Block): # case, even if they are not secondary. 
unambiguous = { 'abi': 'aby', - 'aby_na': 'na', + 'aby_na': 'na:loc', 'ačkoliv': 'ačkoli', 'ať': 'ať', # remove morphological case 'ať_forma': 'formou:gen', @@ -240,6 +240,19 @@ class FixEdeprels(Block): 'že_za': 'za:gen' } + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + def process_node(self, node): """ Occasionally the edeprels automatically derived from the Czech basic @@ -266,7 +279,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duchu(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) # Removing 'až' must be done early. The remainder may be 'počátek' # and we will want to convert it to 'počátkem:gen'. edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) @@ -297,12 +310,10 @@ def process_node(self, node): # identify the correct one. m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. 
- prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + continue if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) @@ -323,7 +334,7 @@ def process_node(self, node): node.feats['Tense'] = '' node.feats['VerbForm'] = '' node.feats['Voice'] = '' - elif re.match(r'^(nmod|obl):', edep['deprel']): + elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has From 8d4a6e8800dc593a0d30558f794f650ff88b84e8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Mar 2022 11:46:58 +0100 Subject: [PATCH 132/871] More fixes to edeprels. 
--- udapi/block/ud/cs/fixedeprels.py | 2 +- udapi/block/ud/ru/fixedeprels.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 57e2bfb0..7a49bb87 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -19,7 +19,7 @@ class FixEdeprels(Block): 'jak': [], 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], - 'jakoby': ['jakoby_pod'], # these instances in FicTree should be spelled 'jako by' + 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' 'než': [], 'protože': [], 'takže': [], diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 96831746..a5560121 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -15,6 +15,7 @@ class FixEdeprels(Block): 'будто': [], 'ведь': [], 'как': ['как_только'], + 'раз': [], 'словно': [], 'так_что': [], 'чем': [] @@ -33,10 +34,15 @@ class FixEdeprels(Block): 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'возле': 'возле:gen', 'до': 'до:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', + 'относительно': 'относительно:gen', + 'по_мера': 'по_мере:gen', + 'по_отношение_ко?': 'по_отношению_к:dat', 'по_повод': 'по_поводу:gen', + 'по_сравнение_с': 'по_сравнению_с:ins', 'помимо': 'помимо:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', From dffc958c983bd1563ce421822a6f900511aed7b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Mar 2022 18:40:36 +0200 Subject: [PATCH 133/871] Fix edeprels. 
--- udapi/block/ud/cs/fixedeprels.py | 2 +- udapi/block/ud/ru/fixedeprels.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 7a49bb87..6f0258ed 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -20,7 +20,7 @@ class FixEdeprels(Block): 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' - 'než': [], + 'než': ['než_aby'], 'protože': [], 'takže': [], 'třebaže': [] diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index a5560121..79669d63 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -14,7 +14,9 @@ class FixEdeprels(Block): outermost = { 'будто': [], 'ведь': [], + 'если': [], 'как': ['как_только'], + 'нежели': [], 'раз': [], 'словно': [], 'так_что': [], @@ -35,6 +37,7 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'возле': 'возле:gen', + 'вплоть_до': 'вплоть_до:gen', 'до': 'до:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', @@ -48,7 +51,8 @@ class FixEdeprels(Block): 'с_помощь': 'с_помощью:gen', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', - 'спустя': 'спустя:acc' + 'спустя': 'спустя:acc', + 'через': 'через:acc' } def copy_case_from_adposition(self, node, adposition): From 283fd9e7d0ef98f68626ca59b0a36e185539fca4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 4 Apr 2022 14:53:40 +0200 Subject: [PATCH 134/871] Czech o:gen is wrong. 
--- udapi/block/ud/cs/fixedeprels.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 6f0258ed..a7158d6b 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -312,6 +312,9 @@ def process_node(self, node): if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: + ###!!! CAC contains 'o' with genitive, which is wrong! + if m.group(1) == 'o' and adpcase == 'gen': + adpcase = 'acc' edep['deprel'] = m.group(1)+':'+adpcase continue if re.match(r'^(acl|advcl):', edep['deprel']): From f1a1d537e8637a51dc00a6d06e01cccf6898556b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 4 Apr 2022 15:24:37 +0200 Subject: [PATCH 135/871] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index a7158d6b..f2f76b4b 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -311,10 +311,7 @@ def process_node(self, node): m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) - if adpcase: - ###!!! CAC contains 'o' with genitive, which is wrong! 
- if m.group(1) == 'o' and adpcase == 'gen': - adpcase = 'acc' + if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): edep['deprel'] = m.group(1)+':'+adpcase continue if re.match(r'^(acl|advcl):', edep['deprel']): From 702e9b138de59d3d8cc7267d205361cad5b34d65 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 6 Apr 2022 05:27:31 +0200 Subject: [PATCH 136/871] Rename cluster to entity (#104) * `CorefCluster` -> `CorefEntity` * `mention.cluster` -> `mention.entity` * `cluster.cluster_id` -> `entity.eid` * `cluster.cluster_type` -> `entity.etype` * `for cluster in doc.coref_clusters.values():` -> `for entity in doc.coref_entities:` * `for cluster_id, cluster in data.coref_clusters.items():` -> `for eid, entity in doc.eid_to_entity:` * `doc.coref_clusters` kept, but deprecated * rename also clusters/cluster -> entities/entity almost everywhere else (variable and parameter names, comments,...) * new udapi.core.block methods `process_coref_mention` and `process_coref_entity` --- udapi/block/corefud/fixcorefud02.py | 18 +- udapi/block/corefud/fixinterleaved.py | 10 +- udapi/block/corefud/gum2corefud.py | 38 +-- udapi/block/corefud/indexclusters.py | 21 +- udapi/block/corefud/load.py | 2 +- udapi/block/corefud/markcrossing.py | 6 +- udapi/block/corefud/markinterleaved.py | 10 +- udapi/block/corefud/marknested.py | 10 +- udapi/block/corefud/marksamesubspan.py | 10 +- udapi/block/corefud/mergesamespan.py | 32 +-- udapi/block/corefud/movehead.py | 26 +- .../{printclusters.py => printentities.py} | 26 +- udapi/block/corefud/printmentions.py | 8 +- udapi/block/corefud/stats.py | 32 +-- udapi/block/read/oldcorefud.py | 78 +++--- udapi/block/util/eval.py | 16 +- udapi/block/write/oldcorefud.py | 36 +-- udapi/core/block.py | 60 ++++- udapi/core/coref.py | 248 +++++++++--------- udapi/core/document.py | 52 ++-- udapi/core/node.py | 12 +- udapi/core/tests/test_coref.py | 10 +- 22 files changed, 413 insertions(+), 348 deletions(-) rename 
udapi/block/corefud/{printclusters.py => printentities.py} (69%) diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py index b8fe44f7..1575cea6 100644 --- a/udapi/block/corefud/fixcorefud02.py +++ b/udapi/block/corefud/fixcorefud02.py @@ -20,32 +20,32 @@ def process_document(self, doc): if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity': doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity' - for cluster in doc.coref_clusters.values(): - if cluster.cluster_type: + for entity in doc.coref_entities: + if entity.etype: # Harmonize etype. # If gen/spec is distinguished, store it in all mentions' other['gstype']. - etype = cluster.cluster_type.lower() + etype = entity.etype.lower() if etype.startswith('spec') or etype.startswith('gen'): gstype = 'gen' if etype.startswith('gen') else 'spec' - for m in cluster.mentions: + for m in entity.mentions: m.other['gstype'] = gstype if etype == 'spec': etype = 'other' etype = etype.replace('gen', '').replace('spec', '').replace('.', '') etype = NEW_ETYPE.get(etype, etype) - # cluster_type="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. - # Apposition is a mention-based rather than cluster-based attribute. + # etype="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. + # Apposition is a mention-based rather than entity-based attribute. # We don't know which of the mentions it should be assigned, but let's expect all non-first. # UD marks appositions with deprel appos, so once someone checks it is really redunant, # TODO we can delete the appos mention attribute. 
if etype == 'appos': etype = '' - for mention in cluster.mentions[1:]: + for mention in entity.mentions[1:]: mention.other['appos'] = '1' - cluster.cluster_type = etype + entity.etype = etype - for mention in cluster.mentions: + for mention in entity.mentions: # Harmonize bridge relation labels for bridge in mention.bridging: rel = bridge.relation.lower() diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py index 6921c680..c5a1b3ed 100644 --- a/udapi/block/corefud/fixinterleaved.py +++ b/udapi/block/corefud/fixinterleaved.py @@ -5,10 +5,10 @@ class FixInterleaved(Block): """Fix mentions with interleaved or crossing spans.""" - def __init__(self, same_cluster_only=True, both_discontinuous=False, + def __init__(self, same_entity_only=True, both_discontinuous=False, crossing_only=False, nested_same_subspan=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.crossing_only = crossing_only self.nested_same_subspan = nested_same_subspan @@ -22,7 +22,7 @@ def process_tree(self, tree): for mA, mB in itertools.combinations(mentions, 2): if mA in deleted or mB in deleted: continue - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue # Fully nested spans are OK, expect for same-subspan @@ -53,7 +53,7 @@ def process_tree(self, tree): except ValueError: pass try: - mB.cluster.mentions.remove(mB) + mB.entity.mentions.remove(mB) except ValueError: pass deleted.add(mB) @@ -75,7 +75,7 @@ def process_tree(self, tree): except ValueError: pass try: - mA.cluster.mentions.remove(mA) + mA.entity.mentions.remove(mA) except ValueError: pass break diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py index bcd24968..bf6d798d 100644 --- a/udapi/block/corefud/gum2corefud.py +++ b/udapi/block/corefud/gum2corefud.py @@ -8,7 +8,7 @@ 
class Gum2CorefUD(Block): def process_tree(self, tree): docname = tree.bundle.document.meta['docname'] + '_' - clusters = tree.bundle.document.coref_clusters + eid_to_entity = tree.bundle.document._eid_to_entity unfinished_mentions = defaultdict(list) for node in tree.descendants: misc_entity = node.misc['Entity'] @@ -47,15 +47,15 @@ def process_tree(self, tree): else: raise ValueError(f"Less than 5 attributes in {entity} at {node}") name = docname + grp - cluster = clusters.get(name) - if cluster is None: - cluster = node.create_coref_cluster(cluster_id=name, cluster_type=etype) - mention = cluster.mentions[0] + entity = eid_to_entity.get(name) + if entity is None: + entity = node.create_coref_entity(eid=name, etype=etype) + mention = entity.mentions[0] mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" if wiki: mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') else: - mention = cluster.create_mention(head=node) + mention = entity.create_mention(head=node) if closing: mention.words = [node] else: @@ -71,23 +71,23 @@ def process_tree(self, tree): except ValueError as err: raise ValueError(f"{node}: {misc_bridge} {err}") try: - trg_cluster = clusters[trg_str] - src_cluster = clusters[src_str] + trg_entity = eid_to_entity[trg_str] + src_entity = eid_to_entity[src_str] except KeyError as err: - logging.warning(f"{node}: Cannot find cluster {err}") + logging.warning(f"{node}: Cannot find entity {err}") else: - mention = src_cluster.mentions[-1] + mention = src_entity.mentions[-1] # TODO: what relation should we choose for Bridging? # relation = f"{src_str.split('-')[0]}-{trg_str.split('-')[0]}" relation = '_' - mention.bridging.append((trg_cluster, relation)) + mention.bridging.append((trg_entity, relation)) del node.misc['Bridge'] misc_split = node.misc['Split'] if misc_split: # E.g. 
Entity=(person-54)|Split=4<54,9<54 src_str = docname + misc_split.split('<')[-1] - ante_clusters = [] + ante_entities = [] for x in misc_split.split(','): ante_str, this_str = [docname + grp for grp in x.split('<')] if this_str != src_str: @@ -96,16 +96,16 @@ def process_tree(self, tree): # There are just three such cases in GUM and all are bugs, # so let's ignore them entirely (the `else` clause will be skipped if exiting `for` w/ `break`). # break - ante_clusters.append(clusters[ante_str]) + ante_entities.append(eid_to_entity[ante_str]) else: - clusters[src_str].split_ante = ante_clusters + eid_to_entity[src_str].split_ante = ante_entities del node.misc['Split'] - for cluster_name, mentions in unfinished_mentions.items(): + for entity_name, mentions in unfinished_mentions.items(): for mention in mentions: logging.warning(f"Mention {name} opened at {mention.head}, but not closed in the same tree. Deleting.") - cluster = mention.cluster + entity = mention.entity mention.words = [] - cluster._mentions.remove(mention) - if not cluster._mentions: - del clusters[name] + entity._mentions.remove(mention) + if not entity._mentions: + del eid_to_entity[name] diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index 14cf778d..3f5d74d8 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -3,10 +3,10 @@ class IndexClusters(Block): - """Re-index the coreference cluster IDs. The final cluster IDs are of the "e" form, + """Re-index the coreference entity IDs (eid). The final entity IDs are of the "e" form, where are ordinal numbers starting from the one specified by the `start` parameter. This block can be applied on multiple documents within one udapy call. 
- For example, to re-index ClusterId in all conllu files in the current directory + For example, to re-index eid in all conllu files in the current directory (keeping the IDs unique across all the files), use: `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1` @@ -23,14 +23,13 @@ def __init__(self, start=1, prefix='e'): self.prefix = prefix def process_document(self, doc): - clusters = doc.coref_clusters - if not clusters: + entities = doc.coref_entities + if not entities: return - new_clusters = {} - for idx, cid in enumerate(clusters, self.start): - cluster = clusters[cid] - new_cid = self.prefix + str(idx) - cluster.cluster_id = new_cid - new_clusters[new_cid] = cluster + new_eid_to_entity = {} + for idx, entity in enumerate(entities, self.start): + new_eid = self.prefix + str(idx) + entity.eid = new_eid + new_eid_to_entity[new_eid] = entity self.start = idx + 1 - doc._coref_clusters = new_clusters + doc._eid_to_entity = new_eid_to_entity diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py index 3b2534bc..92773dc2 100644 --- a/udapi/block/corefud/load.py +++ b/udapi/block/corefud/load.py @@ -8,5 +8,5 @@ def __init__(self, strict=True): self.strict = strict def process_document(self, doc): - if doc._coref_clusters is None: + if doc._eid_to_entity is None: udapi.core.coref.load_coref_from_misc(doc, self.strict) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py index a6d9346a..8064e67f 100644 --- a/udapi/block/corefud/markcrossing.py +++ b/udapi/block/corefud/markcrossing.py @@ -6,10 +6,10 @@ class MarkCrossing(Block): """Find mentions with crossing spans.""" - def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, + def __init__(self, same_entity_only=False, continuous_only=False, print_form=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only 
self.continuous_only = continuous_only self.print_form = print_form self.log = log @@ -26,7 +26,7 @@ def process_node(self, node): if len(node.coref_mentions) > 1: for mA, mB in itertools.combinations(node.coref_mentions, 2): if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.continuous_only and (',' in mA.span or ',' in mB.span): continue diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py index ac4d9438..c00f73b1 100644 --- a/udapi/block/corefud/markinterleaved.py +++ b/udapi/block/corefud/markinterleaved.py @@ -5,10 +5,10 @@ class MarkInterleaved(Block): """Find mentions with interleaved spans.""" - def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.print_form = print_form self.log = log @@ -16,9 +16,9 @@ def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form def _print(self, mention): if self.print_form: - return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) else: - return mention.cluster.cluster_id + ':' + mention.span + return mention.entity.eid + ':' + mention.span def process_tree(self, tree): mentions = set() @@ -33,7 +33,7 @@ def process_tree(self, tree): continue if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: continue - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.both_discontinuous and (',' not in mA.span or 
',' not in mB.span): continue diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py index 656111c6..8db8a657 100644 --- a/udapi/block/corefud/marknested.py +++ b/udapi/block/corefud/marknested.py @@ -5,10 +5,10 @@ class MarkNested(Block): """Find nested mentions.""" - def __init__(self, same_cluster_only=True, both_discontinuous=False, multiword_only=False, + def __init__(self, same_entity_only=True, both_discontinuous=False, multiword_only=False, print_form=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.multiword_only = multiword_only self.print_form = print_form @@ -17,9 +17,9 @@ def __init__(self, same_cluster_only=True, both_discontinuous=False, multiword_o def _print(self, mention): if self.print_form: - return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) else: - return mention.cluster.cluster_id + ':' + mention.span + return mention.entity.eid + ':' + mention.span def process_tree(self, tree): mentions = set() @@ -27,7 +27,7 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) for mA, mB in itertools.combinations(mentions, 2): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): continue diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py index f99e0e13..f3cfd7b3 100644 --- a/udapi/block/corefud/marksamesubspan.py +++ b/udapi/block/corefud/marksamesubspan.py @@ -5,10 +5,10 @@ class MarkSameSubSpan(Block): """Find mentions with the same subspan.""" - def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, nested_only=False, + def 
__init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, nested_only=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.nested_only = nested_only self.print_form = print_form @@ -17,9 +17,9 @@ def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form def _print(self, mention): if self.print_form: - return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) else: - return mention.cluster.cluster_id + ':' + mention.span + return mention.entity.eid + ':' + mention.span def process_tree(self, tree): mentions = set() @@ -28,7 +28,7 @@ def process_tree(self, tree): mentions.add(m) if len(mentions) > 1: for mA, mB in itertools.combinations(mentions, 2): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): continue diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py index bdeefd7c..61b613cb 100644 --- a/udapi/block/corefud/mergesamespan.py +++ b/udapi/block/corefud/mergesamespan.py @@ -11,9 +11,9 @@ class MergeSameSpan(Block): CorefUD data, so this block processes one sentence at a time. 
""" - def __init__(self, same_cluster_only=False, **kwargs): + def __init__(self, same_entity_only=False, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only def process_tree(self, tree): mentions = set() @@ -22,31 +22,31 @@ def process_tree(self, tree): mentions.add(m) for mA, mB in itertools.combinations(mentions, 2): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue # Reduce non-determinism in which mention is removed: - # If the mentions belong to different entities, sort them by entity (cluster) ids. - if mA.cluster.cluster_id > mB.cluster.cluster_id: + # If the mentions belong to different entities, sort them by entity (entity) ids. + if mA.entity.eid > mB.entity.eid: mA, mB = mB, mA sA, sB = set(mA.words), set(mB.words) if sA != sB: continue - # If the mentions belong to different clusters, we should merge the - # clusters first, i.e., pick one cluster as the survivor, move the - # mentions from the other cluster to this cluster, and remove the - # other cluster. - if mA.cluster != mB.cluster: - logging.warning(f"Merging same-span mentions that belong to different entities: {mA.cluster.cluster_id} vs. {mB.cluster.cluster_id}") - ###!!! TODO: As of now, changing the cluster of a mention is not supported in the API. - #for m in mB.cluster.mentions: - # m.cluster = mA.cluster + # If the mentions belong to different entities, we should merge the + # entities first, i.e., pick one entity as the survivor, move the + # mentions from the other entity to this entity, and remove the + # other entity. + if mA.entity != mB.entity: + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.entity.eid} vs. {mB.entity.eid}") + ###!!! TODO: As of now, changing the entity of a mention is not supported in the API. + #for m in mB.entity.mentions: + # m.entity = mA.entity # Remove mention B. 
It may have been removed earlier because of # another duplicate, that is the purpose of try-except. - ###!!! TODO: If we remove a singleton, we are destroying the cluster. Then we must also handle possible bridging and split antecedents pointing to that cluster! + ###!!! TODO: If we remove a singleton, we are destroying the entity. Then we must also handle possible bridging and split antecedents pointing to that entity! mB.words = [] try: - mB.cluster.mentions.remove(mB) + mB.entity.mentions.remove(mB) except ValueError: pass diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py index 2a38bd82..00a32e9f 100644 --- a/udapi/block/corefud/movehead.py +++ b/udapi/block/corefud/movehead.py @@ -75,20 +75,18 @@ def find_head(self, mention): # Finally, return the word-order-wise first head candidate as the head. return enh_heads[0], 'nontreelet' - def process_document(self, doc): - for cluster in doc.coref_clusters.values(): - for mention in cluster.mentions: - self.counter['total'] += 1 - if len(mention.words) < 2: - self.counter['single-word'] += 1 - else: - new_head, category = self.find_head(mention) - self.counter[category] += 1 - if new_head is mention.head: - self.counter[category + '-kept'] += 1 - else: - self.counter[category + '-moved'] += 1 - mention.head = new_head + def process_coref_mention(self, mention): + self.counter['total'] += 1 + if len(mention.words) < 2: + self.counter['single-word'] += 1 + else: + new_head, category = self.find_head(mention) + self.counter[category] += 1 + if new_head is mention.head: + self.counter[category + '-kept'] += 1 + else: + self.counter[category + '-moved'] += 1 + mention.head = new_head def process_end(self): logging.info("corefud.MoveHead overview of mentions:") diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printentities.py similarity index 69% rename from udapi/block/corefud/printclusters.py rename to udapi/block/corefud/printentities.py index 7271ae78..7230c6a5 100644 
--- a/udapi/block/corefud/printclusters.py +++ b/udapi/block/corefud/printentities.py @@ -3,20 +3,20 @@ from udapi.core.block import Block from collections import Counter, defaultdict -class PrintClusters(Block): - """Block corefud.PrintClusters prints all mentions of a given cluster.""" +class PrintEntities(Block): + """Block corefud.PrintEntities prints all mentions of a given entity.""" - def __init__(self, id_re=None, min_mentions=0, print_ranges=True, mark_head=True, + def __init__(self, eid_re=None, min_mentions=0, print_ranges=True, mark_head=True, aggregate_mentions=True, **kwargs): """Params: - id_re: regular expression constraining ClusterId of the clusters to be printed - min_mentions: print only clusters with with at least N mentions + eid_re: regular expression constraining ID of the entities to be printed + min_mentions: print only entities with with at least N mentions print_ranges: print also addressess of all mentions (compactly, using the longest common prefix of sent_id) mark_head: mark the head (e.g. 
as "red **car**") """ super().__init__(**kwargs) - self.id_re = re.compile(str(id_re)) if id_re else None + self.eid_re = re.compile(str(eid_re)) if eid_re else None self.min_mentions = min_mentions self.print_ranges = print_ranges self.mark_head = mark_head @@ -24,17 +24,17 @@ def __init__(self, id_re=None, min_mentions=0, print_ranges=True, mark_head=True def process_document(self, doc): if 'docname' in doc.meta: - print(f"Coref clusters in document {doc.meta['docname']}:") - for cluster in doc.coref_clusters.values(): - if self.id_re and not self.id_re.match(cluster.cluster_id): + print(f"Coref entities in document {doc.meta['docname']}:") + for entity in doc.coref_entities: + if self.eid_re and not self.eid_re.match(entity.eid): continue - if len(cluster.mentions) < self.min_mentions: + if len(entity.mentions) < self.min_mentions: continue - print(f" {cluster.cluster_id} has {len(cluster.mentions)} mentions:") + print(f" {entity.eid} has {len(entity.mentions)} mentions:") if self.aggregate_mentions: counter = Counter() ranges = defaultdict(list) - for mention in cluster.mentions: + for mention in entity.mentions: forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) counter[forms] += 1 if self.print_ranges: @@ -48,7 +48,7 @@ def process_document(self, doc): prefix = os.path.commonprefix(ranges[form]) print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') else: - for mention in cluster.mentions: + for mention in entity.mentions: forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) print(' ' + forms) if self.print_ranges: diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index e26ee6e2..7ed31b0d 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -99,9 +99,9 @@ def _is_almost_continuous(self, mention): def process_document(self, doc): mentions = [] - 
for cluster in doc.coref_clusters.values(): - if self._ok(len(cluster.mentions) == 1, self.singleton): - mentions.extend(cluster.mentions) + for entity in doc.coref_entities: + if self._ok(len(entity.mentions) == 1, self.singleton): + mentions.extend(entity.mentions) if self.shuffle: random.shuffle(mentions) else: @@ -146,7 +146,7 @@ def process_document(self, doc): print("# Mention = " + this_form) if self.print_other_forms: counter = Counter() - for m in mention.cluster.mentions: + for m in mention.entity.mentions: forms = ' '.join([w.form for w in m.words]) if forms != this_form: counter[forms] += 1 diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index e39195db..cdd84e7a 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -4,14 +4,14 @@ class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_clusters=True, + def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entities=True, report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM', exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max self.c_len_max = c_len_max self.report_mentions = report_mentions - self.report_clusters = report_clusters + self.report_entities = report_entities self.report_details = report_details self.exclude_singletons = exclude_singletons self.exclude_nonsingletons = exclude_nonsingletons @@ -21,29 +21,29 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_cluste self.counter = Counter() self.mentions = 0 - self.clusters = 0 + self.entities = 0 self.total_nodes = 0 self.longest_mention = 0 - self.longest_cluster = 0 + self.longest_entity = 0 self.m_words = 0 self.selected_upos = None if selected_upos == 'all' else selected_upos.split() def process_document(self, doc): self.total_nodes 
+= len(list(doc.nodes)) - for cluster in doc.coref_clusters.values(): - len_mentions = len(cluster.mentions) + for entity in doc.coref_entities: + len_mentions = len(entity.mentions) if len_mentions == 1 and self.exclude_singletons: continue elif len_mentions > 1 and self.exclude_nonsingletons: continue - self.longest_cluster = max(len_mentions, self.longest_cluster) + self.longest_entity = max(len_mentions, self.longest_entity) self.counter['c_total_len'] += len_mentions self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 - self.clusters += 1 + self.entities += 1 if not self.report_mentions and not self.report_details: continue - for mention in cluster.mentions: + for mention in entity.mentions: self.mentions += 1 all_words = len(mention.words) non_empty = len([w for w in mention.words if not w.is_empty()]) @@ -68,17 +68,17 @@ def process_document(self, doc): def process_end(self): mentions_nonzero = 1 if self.mentions == 0 else self.mentions - clusters_nonzero = 1 if self.clusters == 0 else self.clusters + entities_nonzero = 1 if self.entities == 0 else self.entities total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes columns =[ ] - if self.report_clusters: - columns += [('clusters', f"{self.clusters:7,}"), - ('clusters_per1k', f"{1000 * self.clusters / total_nodes_nonzero:6.0f}"), - ('longest_cluster', f"{self.longest_cluster:6}"), - ('avg_cluster', f"{self.counter['c_total_len'] / clusters_nonzero:5.1f}")] + if self.report_entities: + columns += [('entities', f"{self.entities:7,}"), + ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), + ('longest_entity', f"{self.longest_entity:6}"), + ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] for i in range(1, self.c_len_max + 1): - percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero + percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) if 
self.report_mentions: columns += [('mentions', f"{self.mentions:7,}"), diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py index 539d5036..73e05f3b 100644 --- a/udapi/block/read/oldcorefud.py +++ b/udapi/block/read/oldcorefud.py @@ -2,7 +2,7 @@ import re import logging import udapi.block.read.conllu -from udapi.core.coref import CorefCluster, CorefMention, BridgingLinks +from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks class OldCorefUD(udapi.block.read.conllu.Conllu): @@ -11,7 +11,7 @@ def __init__(self, replace_hyphen_in_id_with='', **kwargs): Args: substitute_hyphen_in_id_for: string to use as a replacement for hyphens in ClusterId - The new format does not allow hyphens in eid (IDs of entity clusters), + The new format does not allow hyphens in eid (IDs of entity entities), so we need to replace them. """ super().__init__(**kwargs) @@ -36,27 +36,27 @@ def _fix_id(self, cid): def process_document(self, doc, strict=True): super().process_document(doc) - clusters = {} + eid_to_entity = {} for node in doc.nodes_and_empty: index, index_str = 0, "" - cluster_id = node.misc["ClusterId"] - if not cluster_id: + eid = node.misc["ClusterId"] + if not eid: index, index_str = 1, "[1]" - cluster_id = node.misc["ClusterId[1]"] - cluster_id = self._fix_id(cluster_id) - while cluster_id: - cluster = clusters.get(cluster_id) - if cluster is None: - cluster = CorefCluster(cluster_id) - clusters[cluster_id] = cluster - mention = CorefMention(words=[node], cluster=cluster) + eid = node.misc["ClusterId[1]"] + eid = self._fix_id(eid) + while eid: + entity = eid_to_entity.get(eid) + if entity is None: + entity = CorefEntity(eid) + eid_to_entity[eid] = entity + mention = CorefMention(words=[node], entity=entity) if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] - cluster_type = node.misc["ClusterType" + index_str] - if cluster_type: - if cluster.cluster_type is not None and cluster_type != 
cluster.cluster_type: - logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") - cluster.cluster_type = cluster_type + etype = node.misc["ClusterType" + index_str] + if etype: + if entity.etype is not None and etype != entity.etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + entity.etype = etype bridging_str = node.misc["Bridging" + index_str] if bridging_str: @@ -64,11 +64,11 @@ def process_document(self, doc, strict=True): for link_str in bridging_str.split(','): target, relation = link_str.split(':') target = self._fix_id(target) - if target == cluster_id: - _error("Bridging cannot self-reference the same cluster: " + target, strict) - if target not in clusters: - clusters[target] = CorefCluster(target) - mention._bridging.append((clusters[target], relation)) + if target == eid: + _error("Bridging cannot self-reference the same entity: " + target, strict) + if target not in eid_to_entity: + eid_to_entity[target] = CorefEntity(target) + mention._bridging.append((eid_to_entity[target], relation)) split_ante_str = node.misc["SplitAnte" + index_str] if split_ante_str: @@ -77,16 +77,16 @@ def process_document(self, doc, strict=True): # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. for ante_str in split_ante_str.replace('+', ',').split(','): ante_str = self._fix_id(ante_str) - if ante_str in clusters: - if ante_str == cluster_id: - _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) - split_antes.append(clusters[ante_str]) + if ante_str in eid_to_entity: + if ante_str == eid: + _error("SplitAnte cannot self-reference the same entity: " + eid, strict) + split_antes.append(eid_to_entity[ante_str]) else: # split cataphora, e.g. "We, that is you and me..." 
- ante_cl = CorefCluster(ante_str) - clusters[ante_str] = ante_cl + ante_cl = CorefEntity(ante_str) + eid_to_entity[ante_str] = ante_cl split_antes.append(ante_cl) - cluster.split_ante = sorted(split_antes) + entity.split_ante = sorted(split_antes) # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas. # We also need to escape forbidden characters. @@ -94,16 +94,16 @@ def process_document(self, doc, strict=True): mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') index += 1 index_str = f"[{index}]" - cluster_id = self._fix_id(node.misc["ClusterId" + index_str]) - # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. - # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), - # not by the keys (cluster_id). + eid = self._fix_id(node.misc["ClusterId" + index_str]) + # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__), + # not by the keys (eid). # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order. - for cluster in clusters.values(): - if not cluster._mentions: - _error(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) - cluster._mentions.sort() - doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())} + for entity in eid_to_entity.values(): + if not entity._mentions: + _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + entity._mentions.sort() + doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())} # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). 
attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index 07eab681..0f80d018 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,7 +29,7 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - coref_mention=None, coref_cluster=None, + coref_mention=None, coref_entity=None, expand_code=True, **kwargs): super().__init__(**kwargs) self.doc = doc @@ -43,7 +43,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.before_bundle = before_bundle self.after_bundle = after_bundle self.coref_mention = coref_mention - self.coref_cluster = coref_cluster + self.coref_entity = coref_entity self.expand_code = expand_code self.count = collections.Counter() @@ -74,13 +74,13 @@ def process_document(self, document): # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) - if self.coref_cluster or self.coref_mention: - for cluster in doc.coref_clusters.values(): - if self.coref_cluster: - this = cluster - exec(self.expand_eval_code(self.coref_cluster)) + if self.coref_entity or self.coref_mention: + for entity in doc.coref_entities: + if self.coref_entity: + this = entity + exec(self.expand_eval_code(self.coref_entity)) if self.coref_mention: - for mention in cluster.mentions: + for mention in entity.mentions: this = mention exec(self.expand_eval_code(self.coref_mention)) diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py index 4eb316bb..49f9beb0 100644 --- a/udapi/block/write/oldcorefud.py +++ b/udapi/block/write/oldcorefud.py @@ -6,7 +6,7 @@ class OldCorefUD(udapi.block.write.conllu.Conllu): def process_document(self, doc): - if not doc.coref_clusters: + if not doc.coref_entities: 
logging.warning("Using write.OldCorefUD on a document without any coreference annotation") # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. @@ -17,19 +17,19 @@ def process_document(self, doc): del node.misc[key] del doc.meta['global.Entity'] - # doc._coref_clusters is a dict, which is insertion ordered in Python 3.7+. - # The insertion order is sorted according to CorefCluster.__lt__ (see few lines above). - # However, new clusters could be added meanwhile or some clusters edited, - # so we need to sort the clusters again before storing to MISC. - # We also need to mare sure cluster.mentions are sorted in each cluster - # because the ordering of clusters is defined by the first mention in each cluster. - # Ordering of mentions within a cluster can be changed when e.g. changing the span + # doc._eid_to_entity is a dict, which is insertion ordered in Python 3.7+. + # The insertion order is sorted according to CorefEntity.__lt__ (see few lines above). + # However, new entities could be added meanwhile or some entities edited, + # so we need to sort the entities again before storing to MISC. + # We also need to mare sure entity.mentions are sorted in each entity + # because the ordering of entities is defined by the first mention in each entity. + # Ordering of mentions within a entity can be changed when e.g. changing the span # of a given mention or reordering words within a sentence and in such events - # Udapi currently does not automatically update the ordering of clusters. - for cluster in doc._coref_clusters.values(): - cluster._mentions.sort() - for cluster in sorted(doc._coref_clusters.values()): - for mention in cluster.mentions: + # Udapi currently does not automatically update the ordering of entities. 
+ for entity in doc.coref_entities: + entity._mentions.sort() + for entity in sorted(doc.coref_entities): + for mention in entity.mentions: head = mention.head if head.misc["ClusterId"]: for a in attrs: @@ -44,13 +44,13 @@ def process_document(self, doc): index_str = f"[{index}]" if index == 1: index_str = "" - head.misc["ClusterId" + index_str] = cluster.cluster_id + head.misc["ClusterId" + index_str] = entity.eid head.misc["MentionSpan" + index_str] = mention.span - head.misc["ClusterType" + index_str] = cluster.cluster_type + head.misc["ClusterType" + index_str] = entity.etype if mention._bridging: - head.misc["Bridging" + index_str] = ','.join(f'{l.target.cluster_id}:{l.relation}' for l in sorted(mention.bridging)) - if cluster.split_ante: - serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) + head.misc["Bridging" + index_str] = ','.join(f'{l.target.eid}:{l.relation}' for l in sorted(mention.bridging)) + if entity.split_ante: + serialized = ','.join((c.eid for c in sorted(entity.split_ante))) head.misc["SplitAnte" + index_str] = serialized if mention.other: head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') diff --git a/udapi/core/block.py b/udapi/core/block.py index 32033cde..f039abce 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,6 +1,9 @@ """Block class represents the basic Udapi processing unit.""" import logging +def not_overridden(method): + method.is_not_overridden = True + return method class Block(object): """The smallest processing unit for processing Universal Dependencies data. 
@@ -23,10 +26,12 @@ def process_end(self): """A hook method that is executed after processing all UD data""" pass + @not_overridden def process_node(self, _): """Process a UD node""" - raise Exception("No processing activity defined in block " + str(self)) + pass + @not_overridden def process_tree(self, tree): """Process a UD tree""" # tree.descendants is slightly slower than tree._descendants (0.05s per iterating over 700k words), @@ -36,6 +41,7 @@ def process_tree(self, tree): for node in tree.descendants: self.process_node(node) + @not_overridden def process_bundle(self, bundle): """Process a UD bundle""" for tree in bundle: @@ -54,10 +60,54 @@ def apply_on_document(self, document): def process_document(self, document): """Process a UD document""" - for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) - self.process_bundle(bundle) + # Calling document.coref_entities is expensive because + # it needs to deserialize coref_entities from the MISC attributes. + # If no block in a scenario needs to process coreference entities/mentions, + # the deserialization does not need to be done. + # So we need to detect if any of the methods process_coref_entity and process_coref_mention + # has been overriden (without calling them, which could have adverse side effects). + # Let's use method annotations for this. 
+ p_entity = not hasattr(self.process_coref_entity, 'is_not_overridden') + p_mention = not hasattr(self.process_coref_mention, 'is_not_overridden') + p_bundle = not hasattr(self.process_bundle, 'is_not_overridden') + p_tree = not hasattr(self.process_tree, 'is_not_overridden') + p_node = not hasattr(self.process_node, 'is_not_overridden') + if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): + raise Exception("No processing activity defined in block " + str(self)) + + if p_entity or p_mention: + for entity in document.coref_entities: + if p_entity: + self.process_coref_entity(entity) + else: + for mention in entity.mentions: + self.process_coref_mention(mention) + + if p_bundle or p_tree or p_node: + for bundle_no, bundle in enumerate(document.bundles, 1): + logging.debug('Block %s processing bundle #%d (id=%s)', + self.__class__.__name__, bundle_no, bundle.bundle_id) + if p_bundle: + self.process_bundle(bundle) + else: + for tree in bundle: + if self._should_process_tree(tree): + if p_tree: + self.process_tree(tree) + else: + for node in tree.descendants: + self.process_node(node) + + @not_overridden + def process_coref_entity(self, entity): + """This method is called on each coreference entity in the document.""" + for mention in entity.mentions: + self.process_coref_mention(mention) + + @not_overridden + def process_coref_mention(self, mention): + """This method is called on each coreference mention in the document.""" + pass def before_process_document(self, document): """This method is called before each process_document.""" diff --git a/udapi/core/coref.py b/udapi/core/coref.py index eef25dd2..ff66c77f 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -107,21 +107,21 @@ @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words', '_other'] + __slots__ = ['_head', '_entity', '_bridging', '_words', '_other'] - def 
__init__(self, words, head=None, cluster=None, add_word_backlinks=True): + def __init__(self, words, head=None, entity=None, add_word_backlinks=True): if not words: raise ValueError("mention.words must be non-empty") self._head = head if head else words[0] - self._cluster = cluster - if cluster is not None: - cluster._mentions.append(self) + self._entity = entity + if entity is not None: + entity._mentions.append(self) self._bridging = None self._other = None self._words = words if add_word_backlinks: for new_word in words: - if not new_word._mentions or not cluster or self > new_word._mentions[-1]: + if not new_word._mentions or not entity or self > new_word._mentions[-1]: new_word._mentions.append(self) else: new_word._mentions.append(self) @@ -141,8 +141,8 @@ def __lt__(self, another): their order is defined by the order of the last word in their span. For example precedes . - The order of two same-span mentions is currently defined by their cluster_id. - There should be no same-span (or same-subspan) same-cluster mentions. + The order of two same-span mentions is currently defined by their eid. + There should be no same-span (or same-subspan) same-entity mentions. 
""" #TODO: no mention.words should be handled already when loading if not self._words: @@ -159,7 +159,7 @@ def __lt__(self, another): return True if another._words[-1].precedes(self._words[-1]): return False - return self._cluster.cluster_id < another._cluster.cluster_id + return self._entity.eid < another._entity.eid return self._words[0].precedes(another._words[0]) @property @@ -186,15 +186,15 @@ def head(self, new_head): self._head = new_head @property - def cluster(self): - return self._cluster + def entity(self): + return self._entity - @cluster.setter - def cluster(self, new_cluster): - if self._cluster is not None: - raise NotImplementedError('changing the cluster of a mention not supported yet') - self._cluster = new_cluster - new_cluster._mentions.append(new_cluster) + @entity.setter + def entity(self, new_entity): + if self._entity is not None: + raise NotImplementedError('changing the entity of a mention not supported yet') + self._entity = new_entity + new_entity._mentions.append(new_entity) @property def bridging(self): @@ -216,7 +216,7 @@ def words(self): @words.setter def words(self, new_words): if new_words and self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._cluster.cluster_id}") + raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._entity.eid}") kept_words = [] # Make sure each word is included just once and they are in the correct order. 
new_words = sorted(list(set(new_words))) @@ -247,44 +247,44 @@ def span(self, new_span): @functools.total_ordering -class CorefCluster(object): +class CorefEntity(object): """Class for representing all mentions of a given entity.""" - __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] + __slots__ = ['_eid', '_mentions', 'etype', 'split_ante'] - def __init__(self, cluster_id, cluster_type=None): - self._cluster_id = None # prepare the _cluster_id slot - self.cluster_id = cluster_id # call the setter and check the ID is valid + def __init__(self, eid, etype=None): + self._eid = None # prepare the _eid slot + self.eid = eid # call the setter and check the ID is valid self._mentions = [] - self.cluster_type = cluster_type + self.etype = etype self.split_ante = [] def __lt__(self, another): - """Does this CorefCluster precedes (word-order wise) `another` cluster? + """Does this CorefEntity precedes (word-order wise) `another` entity? - This method defines a total ordering of all clusters - by the first mention of each cluster (see `CorefMention.__lt__`). - If one of the clusters has no mentions (which should not happen normally), + This method defines a total ordering of all entities + by the first mention of each entity (see `CorefMention.__lt__`). + If one of the entities has no mentions (which should not happen normally), there is a backup solution (see the source code). - If cluster IDs are not important, it is recommended to use block - `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering. + If entity IDs are not important, it is recommended to use block + `corefud.IndexClusters` to re-name entity IDs in accordance with this entity ordering. """ if not self._mentions or not another._mentions: - # Clusters without mentions should go first, so the ordering is total. - # If both clusters are missing mentions, let's use cluster_id, so the ordering is stable. 
+ # Entities without mentions should go first, so the ordering is total. + # If both entities are missing mentions, let's use eid, so the ordering is stable. if not self._mentions and not another._mentions: - return self._cluster_id < another._cluster_id + return self._eid < another._eid return not self._mentions return self._mentions[0] < another._mentions[0] @property - def cluster_id(self): - return self._cluster_id + def eid(self): + return self._eid - @cluster_id.setter - def cluster_id(self, new_cluster_id): - if any(x in new_cluster_id for x in CHARS_FORBIDDEN_IN_ID): - raise ValueError(f"{new_cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") - self._cluster_id = new_cluster_id + @eid.setter + def eid(self, new_eid): + if any(x in new_eid for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{new_eid} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") + self._eid = new_eid @property def eid_or_grp(self): @@ -292,18 +292,18 @@ def eid_or_grp(self): meta = root.document.meta if 'GRP' in meta['global.Entity'] and meta['tree2docid']: docid = meta['tree2docid'][root] - if self._cluster_id.startswith(docid): - return self._cluster_id.replace(docid, '', 1) + if self._eid.startswith(docid): + return self._eid.replace(docid, '', 1) else: - logging.warning(f"GRP in global.Entity, but eid={self._cluster_id} does not start with docid={docid}") - return self._cluster_id + logging.warning(f"GRP in global.Entity, but eid={self._eid} does not start with docid={docid}") + return self._eid @property def mentions(self): return self._mentions def create_mention(self, head=None, words=None, span=None): - """Create a new CoreferenceMention object within this CorefCluster. + """Create a new CoreferenceMention object within this CorefEntity. Args: head: a node where the annotation about this CorefMention will be stored in MISC. 
@@ -330,7 +330,7 @@ def create_mention(self, head=None, words=None, span=None): if head is None: head = words[0] - mention = CorefMention(words=[head], head=head, cluster=self) + mention = CorefMention(words=[head], head=head, entity=self) if words: mention.words = words if span: @@ -353,7 +353,7 @@ def all_bridging(self): # from dataclasses import dataclass # @dataclass # class DataClassCard: -# target: CorefCluster +# target: CorefEntity # relation: str class BridgingLink: __slots__ = ['target', 'relation'] @@ -374,9 +374,9 @@ class BridgingLinks(collections.abc.MutableSequence): Example usage: >>> bl = BridgingLinks(src_mention) # empty links >>> bl = BridgingLinks(src_mention, [(c12, 'part'), (c56, 'subset')]) # from a list of tuples - >>> (bl8, bl9) = BridgingLinks.from_string('c12>> for cluster, relation in bl: - >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") + >>> (bl8, bl9) = BridgingLinks.from_string('c12>> for entity, relation in bl: + >>> print(f"{bl.src_mention} ->{relation}-> {entity.eid}") >>> print(str(bl)) # c12>> bl('part').targets == [c12] >>> bl('part|subset').targets == [c12, c56] @@ -384,9 +384,9 @@ class BridgingLinks(collections.abc.MutableSequence): """ @classmethod - def from_string(cls, string, clusters, node, strict=True, tree2docid=None): + def from_string(cls, string, entities, node, strict=True, tree2docid=None): """Return a sequence of BridgingLink objects representing a given string serialization. - The bridging links are also added to the mentions (`mention.bridging`) in the supplied `clusters`, + The bridging links are also added to the mentions (`mention.bridging`) in the supplied `entities`, so the returned sequence can be usually ignored. If `tree2docid` parameter is provided (mapping trees to document IDs used as prefixes in eid), the entity IDs in the provided string are interpreted as "GRP", i.e. 
as document-wide IDs, @@ -403,17 +403,17 @@ def from_string(cls, string, clusters, node, strict=True, tree2docid=None): if ':' in src_str: src_str, relation = src_str.split(':', 1) if trg_str == src_str: - _error(f"Bridge cannot self-reference the same cluster {trg_str} at {node}", strict) + _error(f"Bridge cannot self-reference the same entity {trg_str} at {node}", strict) if tree2docid: src_str = tree2docid[node.root] + src_str trg_str = tree2docid[node.root] + trg_str bl = src_str2bl.get(src_str) if not bl: - bl = clusters[src_str].mentions[-1].bridging + bl = entities[src_str].mentions[-1].bridging src_str2bl[src_str] = bl - if trg_str not in clusters: - clusters[trg_str] = CorefCluster(trg_str) - bl._data.append(BridgingLink(clusters[trg_str], relation)) + if trg_str not in entities: + entities[trg_str] = CorefEntity(trg_str) + bl._data.append(BridgingLink(entities[trg_str], relation)) return src_str2bl.values() def __init__(self, src_mention, value=None, strict=True): @@ -423,8 +423,8 @@ def __init__(self, src_mention, value=None, strict=True): if value is not None: if isinstance(value, collections.abc.Sequence): for v in value: - if v[0] is src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + v[0].cluster_id, strict) + if v[0] is src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + v[0].eid, strict) self._data.append(BridgingLink(v[0], v[1])) else: raise ValueError(f"Unknown value type: {type(value)}") @@ -439,21 +439,21 @@ def __len__(self): # TODO delete backlinks of old links, dtto for SplitAnte def __setitem__(self, key, new_value): - if new_value[0] is self.src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict) + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) self._data[key] = BridgingLink(new_value[0], new_value[1]) def 
__delitem__(self, key): del self._data[key] def insert(self, key, new_value): - if new_value[0] is self.src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict) + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) self._data.insert(key, BridgingLink(new_value[0], new_value[1])) def __str__(self): # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. - return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.cluster.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) + return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.entity.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. 
@@ -466,14 +466,14 @@ def __call__(self, relations_re=None): @property def targets(self): - """Return a list of the target clusters (without relations).""" + """Return a list of the target entities (without relations).""" return [link.target for link in self._data] def _delete_targets_without_mentions(self, warn=True): for link in self._data: if not link.target.mentions: if warn: - logging.warning(f"Cluster {link.target.cluster_id} has no mentions, but is referred to in bridging of {self.src_mention.cluster.cluster_id}") + logging.warning(f"Entity {link.target.eid} has no mentions, but is referred to in bridging of {self.src_mention.entity.eid}") self._data.remove(link) @@ -492,7 +492,7 @@ def _error(msg, strict): def load_coref_from_misc(doc, strict=True): global highest_doc_n - clusters = {} + entities = {} unfinished_mentions = collections.defaultdict(list) discontinuous_mentions = collections.defaultdict(list) global_entity = doc.meta.get('global.Entity') @@ -573,12 +573,12 @@ def load_coref_from_misc(doc, strict=True): try: mention.head = mention.words[head_idx - 1] except IndexError as err: - _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} " + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " f"closed at {node} with words={mention.words}", 1) if subspan_idx and subspan_idx == total_subspans: m = discontinuous_mentions[eid].pop() if m is not mention: - _error(f"Closing mention {mention.cluster.cluster_id} at {node}, but it has unfinished nested mentions ({m.words})", 1) + _error(f"Closing mention {mention.entity.eid} at {node}, but it has unfinished nested mentions ({m.words})", 1) # 3. 
opening or single-word else: @@ -615,18 +615,18 @@ def load_coref_from_misc(doc, strict=True): else: eid, subspan_idx, total_subspans = m.group(1, 2, 3) - cluster = clusters.get(eid) - if cluster is None: + entity = entities.get(eid) + if entity is None: if subspan_idx and subspan_idx != '1': _error(f'Non-first subspan of a discontinuous mention {eid} at {node} does not have any previous mention.', 1) - cluster = CorefCluster(eid) - clusters[eid] = cluster - cluster.cluster_type = etype - elif etype and cluster.cluster_type and cluster.cluster_type != etype: - logging.warning(f"etype mismatch in {node}: {cluster.cluster_type} != {etype}") - # CorefCluster could be created first with "Bridge=" without any type - elif etype and cluster.cluster_type is None: - cluster.cluster_type = etype + entity = CorefEntity(eid) + entities[eid] = entity + entity.etype = etype + elif etype and entity.etype and entity.etype != etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + # CorefEntity could be created first with "Bridge=" without any type + elif etype and entity.etype is None: + entity.etype = etype if subspan_idx and subspan_idx != '1': opened = [pair[0] for pair in unfinished_mentions[eid]] @@ -635,14 +635,14 @@ def load_coref_from_misc(doc, strict=True): if closing and subspan_idx == total_subspans: m = discontinuous_mentions[eid].pop() if m is not mention: - _error(f"{node}: closing mention {mention.cluster.cluster_id} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) + _error(f"{node}: closing mention {mention.entity.eid} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) try: mention.head = mention._words[head_idx - 1] except IndexError as err: - _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} " + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " f"closed at {node} with words={mention._words}", 1) else: - mention = CorefMention(words=[node], 
cluster=cluster) + mention = CorefMention(words=[node], entity=entity) if other: mention._other = other if subspan_idx: @@ -657,7 +657,7 @@ def load_coref_from_misc(doc, strict=True): # or with relations Bridge=e173 Date: Wed, 6 Apr 2022 05:50:19 +0200 Subject: [PATCH 137/871] preparing PyPI release 0.3.0 --- CHANGES.txt | 5 +++++ setup.cfg | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 77d72548..67ced748 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,11 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. +0.3.0 2022-04-06 + - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) + - edits by Dan Zeman in block.ud.* + - Circle-CI (instead of Travis-CI) + 0.2.3 2021-02-23 - support for enhanced dependencies and coreference - requires Python 3.6+ due to f-strings diff --git a/setup.cfg b/setup.cfg index 4e96f81a..a14145ab 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = udapi -version = 0.2.3 +version = 0.3.0 author = Martin Popel author_email = popel@ufal.mff.cuni.cz description = Python framework for processing Universal Dependencies data From 36aae5882fe29a435721ea261f1156437c8e0e0c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 6 Apr 2022 10:22:32 +0200 Subject: [PATCH 138/871] MentionMisc in the OldCorefUD format should have been comma-separated, not space-separated We still use the old format during the CorefUD 1.0 conversion. Ideally, we should rewrite all the import scripts and get rid of the intermediate old-format step. 
--- udapi/block/corefud/concatmentionmisc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py index aeb945a8..74483368 100644 --- a/udapi/block/corefud/concatmentionmisc.py +++ b/udapi/block/corefud/concatmentionmisc.py @@ -14,11 +14,11 @@ def process_tree(self,root): index = matchObj.group(2) finalattr = 'MentionMisc'+index - value = node.misc[attrname] - + value = node.misc[attrname].replace(",", "%2C") + if finalattr not in node.misc: node.misc[finalattr] = f'{innerattrib}:{value}' else: - node.misc[finalattr] += f' {innerattrib}:{value}' + node.misc[finalattr] += f',{innerattrib}:{value}' del node.misc[attrname] From d57eb3b8b45b1e89a4b9b95ab264ce917357b5f6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 7 Apr 2022 16:13:35 +0200 Subject: [PATCH 139/871] =?UTF-8?q?"=D0=BF=D0=BB=D1=8E=D1=81".?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 79669d63..469c9173 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -17,6 +17,7 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'нежели': [], + 'плюс': [], 'раз': [], 'словно': [], 'так_что': [], From 73b9f54b4deb39945ebb04c9b9d83dc677fea3f9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 7 Apr 2022 21:10:50 +0200 Subject: [PATCH 140/871] Russian case markers. 
--- udapi/block/ud/ru/fixedeprels.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 469c9173..e5bab63b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -21,6 +21,7 @@ class FixEdeprels(Block): 'раз': [], 'словно': [], 'так_что': [], + 'хоть': [], 'чем': [] } @@ -40,19 +41,25 @@ class FixEdeprels(Block): 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'до': 'до:gen', + 'из': 'из:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', + 'около': 'около:gen', 'относительно': 'относительно:gen', 'по_мера': 'по_мере:gen', + 'по_мера_то_как': 'по_мере_того_как', 'по_отношение_ко?': 'по_отношению_к:dat', 'по_повод': 'по_поводу:gen', 'по_сравнение_с': 'по_сравнению_с:ins', 'помимо': 'помимо:gen', + 'порядка': 'порядка:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', + 'с_тот_пора_как': 'с_тех_пор_как', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', + 'у': 'у:gen', 'через': 'через:acc' } @@ -126,6 +133,15 @@ def process_node(self, node): # Accusative or instrumental are possible. Pick accusative. edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' continue + m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Dative, accusative or locative are possible. Pick dative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':dat' + continue m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) From a9b3a82baaf6283c4d47a13d4326c9f7807bccc6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 9 Apr 2022 21:35:57 +0200 Subject: [PATCH 141/871] Russian edeprels. 
--- udapi/block/ud/ru/fixedeprels.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index e5bab63b..c7293d69 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -22,7 +22,8 @@ class FixEdeprels(Block): 'словно': [], 'так_что': [], 'хоть': [], - 'чем': [] + 'чем': [], + 'что': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -35,16 +36,21 @@ class FixEdeprels(Block): 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', + 'в_случай_если': 'в_случае_если', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', + 'вроде': 'вроде:gen', + 'для': 'для:gen', 'до': 'до:gen', + 'за_исключение': 'за_исключением:gen', 'из': 'из:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', 'около': 'около:gen', + 'от': 'от:gen', 'относительно': 'относительно:gen', 'по_мера': 'по_мере:gen', 'по_мера_то_как': 'по_мере_того_как', @@ -87,6 +93,10 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # If the case marker starts with 'столько', remove this part. + # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. + # Similarly, 'то' occurs in 'то...то' and should be removed. + edep['deprel'] = re.sub(r':(столько|то|точно)[_:]', ':', edep['deprel']) # If one of the following expressions occurs followed by another preposition # or by morphological case, remove the additional case marking. For example, # 'словно_у' becomes just 'словно'. @@ -124,7 +134,7 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(за|между|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From 4f2eb09306121425ece1dc0717f92f1f8a5d4a74 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 10 Apr 2022 09:06:21 +0200 Subject: [PATCH 142/871] Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index c7293d69..0fc90641 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -40,11 +40,14 @@ class FixEdeprels(Block): 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'во_глава': 'во_главе_с:ins', + 'во_глава_с': 'во_главе_с:ins', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', 'для': 'для:gen', 'до': 'до:gen', + 'до_то_как': 'до:gen', # до того, как ... 'за_исключение': 'за_исключением:gen', 'из': 'из:gen', 'к': 'к:dat', @@ -62,6 +65,7 @@ class FixEdeprels(Block): 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', + 'свыше': 'свыше:gen', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', @@ -125,7 +129,7 @@ def process_node(self, node): # Both "на" and "в" also occur with genitive. However, this # is only because there are numerals in the phrase ("в 9 случаев из 10") # and the whole phrase should not be analyzed as genitive. 
- m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(в|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From 3f399af0e1b88b3eb3e63f1496f6c52e478cf15f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 11 Apr 2022 10:29:14 +0200 Subject: [PATCH 143/871] Fixed Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 0fc90641..fd24be5a 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -18,6 +18,7 @@ class FixEdeprels(Block): 'как': ['как_только'], 'нежели': [], 'плюс': [], + 'пусть': [], 'раз': [], 'словно': [], 'так_что': [], @@ -62,6 +63,7 @@ class FixEdeprels(Block): 'по_сравнение_с': 'по_сравнению_с:ins', 'помимо': 'помимо:gen', 'порядка': 'порядка:gen', + 'после': 'после:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', @@ -70,7 +72,8 @@ class FixEdeprels(Block): 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', 'у': 'у:gen', - 'через': 'через:acc' + 'через': 'через:acc', + 'чтоб': 'чтобы' } def copy_case_from_adposition(self, node, adposition): @@ -138,7 +141,7 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за|между|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: @@ -147,6 +150,15 @@ def process_node(self, node): # Accusative or instrumental are possible. Pick accusative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' continue + m = re.match(r'^(obl(?::arg)?|nmod):(между)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) From 8d0caaea2a7c491ee5787d090e612822da0d2295 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 11 Apr 2022 22:57:47 +0200 Subject: [PATCH 144/871] Fixed Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index fd24be5a..4e4892e4 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,19 +12,20 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { - 'будто': [], - 'ведь': [], - 'если': [], - 'как': ['как_только'], - 'нежели': [], - 'плюс': [], - 'пусть': [], - 'раз': [], - 'словно': [], - 'так_что': [], - 'хоть': [], - 'чем': [], - 'что': [] + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'если': [], + 'как': ['как_только'], + 'нежели': [], + 'плюс': [], + 'пусть': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'чем': [], + 'что': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -33,6 +34,7 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. 
unambiguous = { + 'loc': 'в:loc', 'в_вид': 'в_виде:gen', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', @@ -40,9 +42,11 @@ class FixEdeprels(Block): 'в_случай_если': 'в_случае_если', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', + 'в_тот_время_как': 'в_то_время_как', 'в_ход': 'в_ходе:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', + 'во_избежание': 'во_избежание:gen', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', @@ -65,6 +69,8 @@ class FixEdeprels(Block): 'порядка': 'порядка:gen', 'после': 'после:gen', 'при_помощь': 'при_помощи:gen', + 'при_условие_что': 'при_условии_что', + 'против': 'против:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', 'свыше': 'свыше:gen', From fdaa4480ac29eba170ba66f8bc6916f1805044bc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 12 Apr 2022 12:25:38 +0200 Subject: [PATCH 145/871] Fixed Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 4e4892e4..b243ed0b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -24,6 +24,7 @@ class FixEdeprels(Block): 'словно': [], 'так_что': [], 'хоть': [], + 'хотя': [], 'чем': [], 'что': [] } @@ -44,19 +45,23 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_тот_время_как': 'в_то_время_как', 'в_ход': 'в_ходе:gen', + 'вместо': 'вместо:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', 'во_избежание': 'во_избежание:gen', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', + 'выше': 'выше:gen', 'для': 'для:gen', 'до': 'до:gen', 'до_то_как': 'до:gen', # до того, как ... 
'за_исключение': 'за_исключением:gen', 'из': 'из:gen', 'к': 'к:dat', + 'ко': 'ко:dat', 'несмотря_на': 'несмотря_на:acc', + 'ниже': 'ниже:gen', 'около': 'около:gen', 'от': 'от:gen', 'относительно': 'относительно:gen', @@ -106,6 +111,10 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. + edep['deprel'] = re.sub(r':быть.*', '', edep['deprel']) + # Some markers should be discarded only if they occur as clause markers (acl, advcl). + edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 0bc4776db98d580fe41e8a01d167726018761510 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 12 Apr 2022 15:02:00 +0200 Subject: [PATCH 146/871] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index b243ed0b..f3d9c4ea 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -190,7 +190,7 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+adpcase else: # Genitive or instrumental are possible. Pick instrumental. 
- edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + edep['deprel'] = m.group(1)+':'+m.group(2)+':ins' continue if re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': From 96e7265547073a4f2bce83699afd1b45ebab45de Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 12 Apr 2022 21:16:22 +0200 Subject: [PATCH 147/871] =?UTF-8?q?"=D1=81=D1=82=D0=BE=D0=BB=D1=8C=D0=BA?= =?UTF-8?q?=D0=BE"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index f3d9c4ea..3ed1d91e 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -112,7 +112,7 @@ def process_node(self, node): bdeprel = m.group(1) solved = False # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. - edep['deprel'] = re.sub(r':быть.*', '', edep['deprel']) + edep['deprel'] = re.sub(r':(быть|столько).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. From 5c0271d2e0cd204e6a3ac1a45eef30c2c82a7a31 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 13 Apr 2022 08:31:25 +0200 Subject: [PATCH 148/871] Fixed Russian edeprels. 
--- udapi/block/ud/ru/fixedeprels.py | 41 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 3ed1d91e..bdd4aa9b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,21 +12,23 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { - 'более_чем': [], - 'будто': [], - 'ведь': [], - 'если': [], - 'как': ['как_только'], - 'нежели': [], - 'плюс': [], - 'пусть': [], - 'раз': [], - 'словно': [], - 'так_что': [], - 'хоть': [], - 'хотя': [], - 'чем': [], - 'что': [] + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'если': [], + 'как': ['как_только'], + 'когда': [], + 'нежели': [], + 'плюс': [], + 'потому_что': [], + 'пусть': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'хотя': [], + 'чем': [], + 'что': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -45,11 +47,13 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_тот_время_как': 'в_то_время_как', 'в_ход': 'в_ходе:gen', + 'вблизи': 'вблизи:gen', 'вместо': 'вместо:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', 'во_избежание': 'во_избежание:gen', 'возле': 'возле:gen', + 'вокруг': 'вокруг:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', 'выше': 'выше:gen', @@ -60,6 +64,7 @@ class FixEdeprels(Block): 'из': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', + 'кроме': 'кроме:gen', 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', @@ -73,8 +78,10 @@ class FixEdeprels(Block): 'помимо': 'помимо:gen', 'порядка': 'порядка:gen', 'после': 'после:gen', + 'при': 'при:loc', 'при_помощь': 'при_помощи:gen', 'при_условие_что': 'при_условии_что', + 'про': 'про:acc', 'против': 'против:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', @@ -147,7 +154,7 @@ def process_node(self, 
node): # Both "на" and "в" also occur with genitive. However, this # is only because there are numerals in the phrase ("в 9 случаев из 10") # and the whole phrase should not be analyzed as genitive. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(в|во|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: @@ -156,7 +163,7 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(за|над|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From bde48872deeeae30b9f797974e30eb67211f9dbc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 13 Apr 2022 08:45:05 +0200 Subject: [PATCH 149/871] =?UTF-8?q?Russian=20"=D0=BD=D0=B0=D0=B4"=20does?= =?UTF-8?q?=20not=20seem=20to=20allow=20accusative.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index bdd4aa9b..887b5e58 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -65,6 +65,7 @@ class FixEdeprels(Block): 'к': 'к:dat', 'ко': 'ко:dat', 'кроме': 'кроме:gen', + 'над': 'над:ins', # at least I have not encountered any genuine example of accusative 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', @@ -163,7 +164,8 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за|над|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + # Unlike in Czech, 'над' seems to allow only instrumental and not accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From 322695329bd8426045bae49298a00b9e0179fe66 Mon Sep 17 00:00:00 2001 From: Mehmet Oguz Derin Date: Tue, 19 Apr 2022 14:24:28 +0300 Subject: [PATCH 150/871] Fix a tiny typo (#105) fix documentation of shift_before_node --- udapi/core/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 732ef7f4..ad36aa0a 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -622,7 +622,7 @@ def shift_after_node(self, reference_node, without_children=False, skip_if_desce self._shift_before_ord(reference_node._ord + 1, without_children=without_children) def shift_before_node(self, reference_node, without_children=False, skip_if_descendant=False): - """Shift this node after the reference_node.""" + """Shift this node before the reference_node.""" if not without_children and reference_node.is_descendant_of(self): if skip_if_descendant: return From 87003818c9f906415a402d1f67caf601f6c5380c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 21:19:32 +0200 Subject: [PATCH 151/871] Added a block to fix spurious Spanish auxiliary "tener que". 
--- udapi/block/ud/es/fixtenerque.py | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 udapi/block/ud/es/fixtenerque.py diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py new file mode 100644 index 00000000..bce0b731 --- /dev/null +++ b/udapi/block/ud/es/fixtenerque.py @@ -0,0 +1,33 @@ +"""Block to fix spurious auxiliary verbs in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixTenerQue(Block): + + def process_node(self, node): + """ + Some Spanish treebanks treat the verb 'tener' in constructions such as + 'tener que comer' as auxiliary. This is wrong and the validator will + flag it as an error. This block fixes such annotations. + """ + if node.lemma == 'tener' and node.upos == 'AUX': + node.upos = 'VERB' + # In rare cases the auxiliary may have been promoted due to ellipsis. + # Most of the time however, it is attached as 'aux' to the main verb. + if node.udeprel == 'aux': + mainverb = node.parent + node.parent = mainverb.parent + node.deprel = mainverb.deprel + mainverb.parent = node + mainverb.deprel = 'xcomp' + # Some children of the former main verb should be reattached to 'tener'. + # Others (especially a direct object) should stay with the former main verb. + for c in mainverb.children: + if not re.match(r'^(obj|iobj|obl|conj|list|flat|fixed|goeswith|reparandum)$', c.udeprel): + c.parent = node + # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. + for c in node.children: + if c.form.lower() eq 'que' and c.ord > node.ord and c.ord < mainverb.ord: + c.parent = mainverb + c.deprel = 'mark' From 41e22c90061454851e8170b7f2b7217dda1b95aa Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 21:22:21 +0200 Subject: [PATCH 152/871] Bug fix. 
--- udapi/block/ud/es/fixtenerque.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index bce0b731..ba6691c2 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -28,6 +28,6 @@ def process_node(self, node): c.parent = node # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: - if c.form.lower() eq 'que' and c.ord > node.ord and c.ord < mainverb.ord: + if c.form.lower() == 'que' and c.ord > node.ord and c.ord < mainverb.ord: c.parent = mainverb c.deprel = 'mark' From 738980ec8820a6429235243eed2266cf27e17089 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 21:38:13 +0200 Subject: [PATCH 153/871] Refined the reattachment so that it is mirrored in the enhanced graph. --- udapi/block/ud/es/fixtenerque.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index ba6691c2..f287051c 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -17,17 +17,29 @@ def process_node(self, node): # Most of the time however, it is attached as 'aux' to the main verb. if node.udeprel == 'aux': mainverb = node.parent - node.parent = mainverb.parent - node.deprel = mainverb.deprel - mainverb.parent = node - mainverb.deprel = 'xcomp' + self.reattach(node, mainverb.parent, mainverb.deprel) + self.reattach(mainverb, node, 'xcomp') # Some children of the former main verb should be reattached to 'tener'. # Others (especially a direct object) should stay with the former main verb. for c in mainverb.children: if not re.match(r'^(obj|iobj|obl|conj|list|flat|fixed|goeswith|reparandum)$', c.udeprel): - c.parent = node + self.reattach(c, node, c.deprel) # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. 
for c in node.children: if c.form.lower() == 'que' and c.ord > node.ord and c.ord < mainverb.ord: - c.parent = mainverb - c.deprel = 'mark' + self.reattach(c, mainverb, 'mark') + + def reattach(self, node, parent, deprel): + """ + Changes the incoming dependency relation to a node. Makes sure that the + same change is done in the basic tree and in the enhanced graph. + """ + if node.deps: + # If the enhanced graph contains the current basic relation, remove it. + orig_n_deps = len(node.deps) + node.deps = [x for x in node.deps if x['parent'] != node.parent or re.sub(r':.*', '', x['deprel']) != node.udeprel] + # Add the new basic relation to the enhanced graph only if the original one was there. + if len(node.deps) < orig_n_deps: + node.deps.append({'parent': parent, 'deprel': deprel}) + node.parent = parent + node.deprel = deprel From d2212f7854efad2337d11a94019bb97cd627d73c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 22:06:32 +0200 Subject: [PATCH 154/871] Fix: "tener que" and "ir a" both analyzed as xcomp rather than aux. --- udapi/block/ud/es/fixtenerque.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index f287051c..5fc4c11e 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -10,8 +10,10 @@ def process_node(self, node): Some Spanish treebanks treat the verb 'tener' in constructions such as 'tener que comer' as auxiliary. This is wrong and the validator will flag it as an error. This block fixes such annotations. + + EDIT: 'ir a comer' is processed the same way. """ - if node.lemma == 'tener' and node.upos == 'AUX': + if re.match(r'^(tener|ir)$', node.lemma) and node.upos == 'AUX': node.upos = 'VERB' # In rare cases the auxiliary may have been promoted due to ellipsis. # Most of the time however, it is attached as 'aux' to the main verb. 
@@ -26,7 +28,7 @@ def process_node(self, node): self.reattach(c, node, c.deprel) # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: - if c.form.lower() == 'que' and c.ord > node.ord and c.ord < mainverb.ord: + if re.match(r'^(que|a)$', c.form.lower()) and c.ord > node.ord and c.ord < mainverb.ord: self.reattach(c, mainverb, 'mark') def reattach(self, node, parent, deprel): From 3ed1fecf434a0dc19c0777195d9b2b020d1d8d13 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 23:16:42 +0200 Subject: [PATCH 155/871] Refined list of deprels of children that should not be re-attached. --- udapi/block/ud/es/fixtenerque.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index 5fc4c11e..62fa0f4d 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -24,7 +24,7 @@ def process_node(self, node): # Some children of the former main verb should be reattached to 'tener'. # Others (especially a direct object) should stay with the former main verb. for c in mainverb.children: - if not re.match(r'^(obj|iobj|obl|conj|list|flat|fixed|goeswith|reparandum)$', c.udeprel): + if not re.match(r'^(obj|iobj|obl|ccomp|xcomp|conj|list|compound|flat|fixed|goeswith|reparandum)$', c.udeprel): self.reattach(c, node, c.deprel) # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: From 74c0a91d1f2a3b3a7d56f54a33b98658831eff70 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Apr 2022 14:11:09 +0200 Subject: [PATCH 156/871] A new block to fill out obvious lemma candidates, based on features! 
--- udapi/block/ud/lemmatize.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 udapi/block/ud/lemmatize.py diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py new file mode 100644 index 00000000..49aa5fbf --- /dev/null +++ b/udapi/block/ud/lemmatize.py @@ -0,0 +1,36 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def process_node(self, node): + """ + Some treebanks lack lemmas for some or all words. Occasionally we may be + able to guess that the lemma is identical to the word form. This block + will then fill out the lemma. + + For some parts of speech, we can only say that the form is the lemma if + we have morphological features that will confirm it is the right form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + # Many closed classes do not inflect and have the same lemma as the form (just lowercased). + if re.match(r'^(PUNCT|SYM|ADV|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + node.lemma = node.form.lower() + # NOUN PROPN ADJ PRON DET NUM VERB AUX + # VERB and AUX: use the infinitive + elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf': + node.lemma = node.form.lower() + # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN) + # Note: This rule is wrong in German, where no nouns should be lowercased. 
+ elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() + elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form + # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma) + elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() + # NUM: use masculine nominative (number, if present at all, is lexical) + elif re.match(r'^(NUM)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() From b0b483d7fb699b6f1049ae796e697142288dbff5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Apr 2022 14:18:06 +0200 Subject: [PATCH 157/871] Polarity and degree. --- udapi/block/ud/lemmatize.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py index 49aa5fbf..a234256f 100644 --- a/udapi/block/ud/lemmatize.py +++ b/udapi/block/ud/lemmatize.py @@ -16,18 +16,24 @@ def process_node(self, node): """ if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': # Many closed classes do not inflect and have the same lemma as the form (just lowercased). 
- if re.match(r'^(PUNCT|SYM|ADV|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + if re.match(r'^(PUNCT|SYM|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + node.lemma = node.form.lower() + # NOUN PROPN ADJ PRON DET NUM VERB AUX ADV + # ADV: use positive affirmative + elif re.match(r'^(ADV)$', node.upos) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form.lower() - # NOUN PROPN ADJ PRON DET NUM VERB AUX # VERB and AUX: use the infinitive - elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf': + elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf' and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form.lower() # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN) # Note: This rule is wrong in German, where no nouns should be lowercased. - elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form.lower() - elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form + # ADJ: use masculine singular nominative positive affirmative + elif re.match(r'^(ADJ)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() # ADJ, PRON, DET: use 
masculine singular nominative (pronouns: each person has its own lemma) elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): node.lemma = node.form.lower() From 6ee2c9b98a11785fe3c0ef9ab5c074cab4f4dfb6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 09:33:27 +0200 Subject: [PATCH 158/871] Added a block to fix features of infinitives in Spanish PUD. --- udapi/block/ud/es/fixverbfeats.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 udapi/block/ud/es/fixverbfeats.py diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py new file mode 100644 index 00000000..3972273a --- /dev/null +++ b/udapi/block/ud/es/fixverbfeats.py @@ -0,0 +1,19 @@ +"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD.""" +from udapi.core.block import Block +import logging +import re + +class FixVerbFeats(Block): + + def process_node(self, node): + """ + The features assigned to verbs in Spanish PUD are often wrong, although + the annotation was (reportedly) done manually. For example, infinitives + are tagged with VerbForm=Fin instead of VerbForm=Inf. + """ + if re.match(r'^(VERB|AUX)$', node.upos): + if re.search(r'[aei]r$', node.form, re.IGNORECASE): + # The infinitive has no features other than VerbForm. + node.feats = {} + node.feats['VerbForm'] = 'Inf' + node.lemma = node.form.lower() From 67334da56f852d1c15c03bad7ab679d3eb63879e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 11:38:34 +0200 Subject: [PATCH 159/871] Fix features of gerunds in Spanish. 
--- udapi/block/ud/es/fixverbfeats.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 3972273a..6c924319 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -17,3 +17,9 @@ def process_node(self, node): node.feats = {} node.feats['VerbForm'] = 'Inf' node.lemma = node.form.lower() + elif re.search(r'ndo$', node.form, re.IGNORECASE): + if node.form.lower() != 'entiendo': + # The gerund has no features other than VerbForm. + # The lemma is not always straightforward but we have fixed it manually. + node.feats = {} + node.feats['VerbForm'] = 'Ger' From fd0f74382c7779d0f22d5ad8b5aef341d91eb0dc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:02:23 +0200 Subject: [PATCH 160/871] Fix features of participles in Spanish. --- udapi/block/ud/es/fixverbfeats.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 6c924319..146105a2 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -23,3 +23,16 @@ def process_node(self, node): # The lemma is not always straightforward but we have fixed it manually. node.feats = {} node.feats['VerbForm'] = 'Ger' + elif re.search(r'(d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) 
+ gender = node.feats['Gender'] + number = node.feats['Number'] + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[os]s?$', 'r', node.form.lower()) From 6259fd07eff9ceb90126ab2d3d6f57c0804b6f3a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:05:55 +0200 Subject: [PATCH 161/871] Guess gender and number if unknown. --- udapi/block/ud/es/fixverbfeats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 146105a2..3282a6eb 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -27,8 +27,8 @@ def process_node(self, node): # The (past) participle has always Gender and Number. # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) - gender = node.feats['Gender'] - number = node.feats['Number'] + gender = node.feats['Gender'] ? node.feats['Gender'] : re.search(r'os?$', node.form, re.IGNORECASE) ? 'Masc' : 'Fem' + number = node.feats['Number'] ? node.feats['Number'] : re.search(r's$', node.form, re.IGNORECASE) ? 'Plur' : 'Sing' node.feats = {} node.feats['VerbForm'] = 'Part' node.feats['Tense'] = 'Past' From c7ee815eace81fa82c5204023a5105485c847cd3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:08:41 +0200 Subject: [PATCH 162/871] Fixed: Python syntax instead of Perl. 
--- udapi/block/ud/es/fixverbfeats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 3282a6eb..56d6587c 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -27,8 +27,8 @@ def process_node(self, node): # The (past) participle has always Gender and Number. # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) - gender = node.feats['Gender'] ? node.feats['Gender'] : re.search(r'os?$', node.form, re.IGNORECASE) ? 'Masc' : 'Fem' - number = node.feats['Number'] ? node.feats['Number'] : re.search(r's$', node.form, re.IGNORECASE) ? 'Plur' : 'Sing' + gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') node.feats = {} node.feats['VerbForm'] = 'Part' node.feats['Tense'] = 'Past' From 0644a490d3e749553855d1b0e6fe60199a33d78c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:10:11 +0200 Subject: [PATCH 163/871] Bug fix. 
--- udapi/block/ud/es/fixverbfeats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 56d6587c..6784afde 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -35,4 +35,4 @@ def process_node(self, node): node.feats['Gender'] = gender node.feats['Number'] = number if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): - node.lemma = re.sub(r'd[os]s?$', 'r', node.form.lower()) + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) From e80b81131c7e3264f0c972bdf4428a3247b2e5e0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:12:48 +0200 Subject: [PATCH 164/871] Fixed: "da" is not a participle. --- udapi/block/ud/es/fixverbfeats.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 6784afde..d6e99aa7 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -24,15 +24,16 @@ def process_node(self, node): node.feats = {} node.feats['VerbForm'] = 'Ger' elif re.search(r'(d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): - # The (past) participle has always Gender and Number. - # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). - # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) 
- gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') - number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') - node.feats = {} - node.feats['VerbForm'] = 'Part' - node.feats['Tense'] = 'Past' - node.feats['Gender'] = gender - node.feats['Number'] = number - if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): - node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) + if node.form.lower() != 'da': + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) + gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) From e7dbd6439d765e86322fe106d22d63ee1c3527b8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:18:44 +0200 Subject: [PATCH 165/871] It must be -ado/-ido (to prevent catching puedo, pudo, ayuda, inunda...) --- udapi/block/ud/es/fixverbfeats.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index d6e99aa7..643ecd7c 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -23,17 +23,16 @@ def process_node(self, node): # The lemma is not always straightforward but we have fixed it manually. 
node.feats = {} node.feats['VerbForm'] = 'Ger' - elif re.search(r'(d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): - if node.form.lower() != 'da': - # The (past) participle has always Gender and Number. - # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). - # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) - gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') - number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') - node.feats = {} - node.feats['VerbForm'] = 'Part' - node.feats['Tense'] = 'Past' - node.feats['Gender'] = gender - node.feats['Number'] = number - if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): - node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) + elif re.search(r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) 
+ gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) From de8021fe2616b0eed5f3a425fc51eae57da34483 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 24 Apr 2022 14:26:58 +0200 Subject: [PATCH 166/871] =?UTF-8?q?"=D1=80=D0=B0=D0=B2=D0=BD=D0=BE=5F?= =?UTF-8?q?=D0=BA=D0=B0=D0=BA"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 887b5e58..bff1a677 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -22,6 +22,7 @@ class FixEdeprels(Block): 'плюс': [], 'потому_что': [], 'пусть': [], + 'равно_как': [], 'раз': [], 'словно': [], 'так_что': [], From 7a2d08f84b8af05b9b40ad92d8b70ca9bd885411 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 24 Apr 2022 17:04:33 +0200 Subject: [PATCH 167/871] Fixed some Russian edeprels. 
--- udapi/block/ud/ru/fixedeprels.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index bff1a677..ebf8d213 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -18,6 +18,7 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'когда': [], + 'минус': [], 'нежели': [], 'плюс': [], 'потому_что': [], @@ -42,11 +43,13 @@ class FixEdeprels(Block): 'в_вид': 'в_виде:gen', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', + 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level 'в_связь_с': 'в_связи_с:ins', 'в_случай_если': 'в_случае_если', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_тот_время_как': 'в_то_время_как', + 'в_угода': 'в_угоду:dat', 'в_ход': 'в_ходе:gen', 'вблизи': 'вблизи:gen', 'вместо': 'вместо:gen', @@ -72,6 +75,7 @@ class FixEdeprels(Block): 'около': 'около:gen', 'от': 'от:gen', 'относительно': 'относительно:gen', + 'перед': 'перед:ins', 'по_мера': 'по_мере:gen', 'по_мера_то_как': 'по_мере_того_как', 'по_отношение_ко?': 'по_отношению_к:dat', @@ -124,6 +128,9 @@ def process_node(self, node): edep['deprel'] = re.sub(r':(быть|столько).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). + edep['deprel'] = re.sub(r'^advcl:перед', r'obl:перед', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. 
From b164b5dc65d2c8d71ed1a59867b5033574a99cfe Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Apr 2022 21:37:22 +0200 Subject: [PATCH 168/871] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index ebf8d213..1946faf0 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -18,9 +18,11 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'когда': [], + 'менее_чем': [], 'минус': [], 'нежели': [], 'плюс': [], + 'пока': [], 'потому_что': [], 'пусть': [], 'равно_как': [], @@ -39,8 +41,10 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'versus': 'версус:nom', 'loc': 'в:loc', 'в_вид': 'в_виде:gen', + 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level @@ -52,6 +56,7 @@ class FixEdeprels(Block): 'в_угода': 'в_угоду:dat', 'в_ход': 'в_ходе:gen', 'вблизи': 'вблизи:gen', + 'взамен': 'взамен:gen', 'вместо': 'вместо:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', @@ -69,7 +74,9 @@ class FixEdeprels(Block): 'к': 'к:dat', 'ко': 'ко:dat', 'кроме': 'кроме:gen', + 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' 'над': 'над:ins', # at least I have not encountered any genuine example of accusative + 'насчет': 'насчет:gen', 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', @@ -84,6 +91,7 @@ class FixEdeprels(Block): 'помимо': 'помимо:gen', 'порядка': 'порядка:gen', 'после': 'после:gen', + 'посредством_как': 
'посредством:gen', 'при': 'при:loc', 'при_помощь': 'при_помощи:gen', 'при_условие_что': 'при_условии_что', @@ -125,11 +133,11 @@ def process_node(self, node): bdeprel = m.group(1) solved = False # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. - edep['deprel'] = re.sub(r':(быть|столько).*', '', edep['deprel']) + edep['deprel'] = re.sub(r':(быть|сколь|столько).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:перед', r'obl:перед', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)', r'obl:\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. From f3b8689bffdccd0cf608423b8f50deaee0419207 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Apr 2022 21:41:06 +0200 Subject: [PATCH 169/871] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 1946faf0..a6c702f7 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -137,7 +137,7 @@ def process_node(self, node): # Some markers should be discarded only if they occur as clause markers (acl, advcl). 
edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)', r'obl:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. From 2001bc041a17b5730f18c422eefe1fa85edf46c2 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Apr 2022 22:30:45 +0200 Subject: [PATCH 170/871] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index a6c702f7..1fd649d3 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -15,6 +15,7 @@ class FixEdeprels(Block): 'более_чем': [], 'будто': [], 'ведь': [], + 'ежели': [], 'если': [], 'как': ['как_только'], 'когда': [], @@ -45,13 +46,16 @@ class FixEdeprels(Block): 'loc': 'в:loc', 'в_вид': 'в_виде:gen', 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' + 'в_для': 'в:acc', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level 'в_связь_с': 'в_связи_с:ins', 'в_случай_если': 'в_случае_если', + 'в_случай_когда': 'в_случае_когда', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', + 'в_то_быть': 'в:loc', 'в_тот_время_как': 'в_то_время_как', 'в_угода': 'в_угоду:dat', 'в_ход': 'в_ходе:gen', @@ -67,10 +71,12 @@ class FixEdeprels(Block): 'вроде': 'вроде:gen', 'выше': 'выше:gen', 
'для': 'для:gen', + 'для_в': 'для:gen', 'до': 'до:gen', 'до_то_как': 'до:gen', # до того, как ... 'за_исключение': 'за_исключением:gen', 'из': 'из:gen', + 'из_более_чем': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', 'кроме': 'кроме:gen', @@ -103,6 +109,7 @@ class FixEdeprels(Block): 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', + 'среди': 'среди:gen', 'у': 'у:gen', 'через': 'через:acc', 'чтоб': 'чтобы' From fc687fcbc394be78aeb15186402f967b68c10b9e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 27 Apr 2022 22:33:54 +0200 Subject: [PATCH 171/871] prevent duplicates in `node.coref_mentions` fix #106 --- udapi/core/coref.py | 2 +- udapi/core/tests/test_coref.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index ff66c77f..edd297b4 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -642,7 +642,7 @@ def load_coref_from_misc(doc, strict=True): _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " f"closed at {node} with words={mention._words}", 1) else: - mention = CorefMention(words=[node], entity=entity) + mention = CorefMention(words=[node], entity=entity, add_word_backlinks=False) if other: mention._other = other if subspan_idx: diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 6142d1f8..369e8caf 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -17,6 +17,10 @@ def test_load(self): coref_entities = docs[-1].coref_entities self.assertEqual(len(coref_entities), 1) self.assertEqual(coref_entities[0].eid, 'e36781') + node = next(docs[-1].nodes) + self.assertEqual(len(node.coref_entities), 1) + self.assertEqual(len(node.coref_mentions), 1) + self.assertEqual(node.coref_entities[0], coref_entities[0]) def test_edits(self): data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') From 641a8d48526e6e0c7623e78b643e56e3df268efa Mon Sep 17 00:00:00 
2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:08:34 +0200 Subject: [PATCH 172/871] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 1fd649d3..7a1d36b2 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -19,11 +19,13 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'когда': [], + 'кроме_как': [], 'менее_чем': [], 'минус': [], 'нежели': [], 'плюс': [], 'пока': [], + 'поскольку': [], 'потому_что': [], 'пусть': [], 'равно_как': [], @@ -33,7 +35,9 @@ class FixEdeprels(Block): 'хоть': [], 'хотя': [], 'чем': [], - 'что': [] + 'что': [], + 'чтобы': [], + 'яко': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -79,14 +83,19 @@ class FixEdeprels(Block): 'из_более_чем': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', + 'коли_скоро': 'коль_скоро', 'кроме': 'кроме:gen', 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' + 'на_вперед': 'на:acc', 'над': 'над:ins', # at least I have not encountered any genuine example of accusative 'насчет': 'насчет:gen', 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', 'от': 'от:gen', + 'от_до': 'от:gen', + 'от_от': 'от:gen', + 'от_с': 'от:gen', 'относительно': 'относительно:gen', 'перед': 'перед:ins', 'по_мера': 'по_мере:gen', @@ -103,14 +112,23 @@ class FixEdeprels(Block): 'при_условие_что': 'при_условии_что', 'про': 'про:acc', 'против': 'против:gen', + 'с_более_чем': 'с:gen', + 'с_во_глава': 'с:ins', + 'с_на': 'с:par', 'с_помощь': 'с_помощью:gen', + 'с_тем': 'с:ins', 'с_тот_пора_как': 'с_тех_пор_как', + 'с_что': 'с:ins', 'свыше': 'свыше:gen', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', 'среди': 'среди:gen', + 
'среди_в': 'среди:gen', + 'так_чтобы': 'чтобы', + 'тем_между': 'между:ins', 'у': 'у:gen', + 'у_без': 'у:gen', 'через': 'через:acc', 'чтоб': 'чтобы' } @@ -140,7 +158,7 @@ def process_node(self, node): bdeprel = m.group(1) solved = False # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. - edep['deprel'] = re.sub(r':(быть|сколь|столько).*', '', edep['deprel']) + edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). From a8fea33f756cc83eec4f0ce71d5e1ee912456cbc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:18:14 +0200 Subject: [PATCH 173/871] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 7a1d36b2..383cdfdd 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -162,8 +162,8 @@ def process_node(self, node): # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). 
- edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)', r'nmod:\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From e938f67739a6633db8624aa110bf386840fcbb7b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:21:41 +0200 Subject: [PATCH 174/871] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 383cdfdd..f7701b0f 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -163,7 +163,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)', r'nmod:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. 
From 9da0f2d79e96f20efc9e4980ea66cb05bf4c5ea4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:24:52 +0200 Subject: [PATCH 175/871] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index f7701b0f..77813cb2 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -162,8 +162,8 @@ def process_node(self, node): # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 8aa69eb0b22941fa1ef52278cdadcaaee35a7823 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:29:21 +0200 Subject: [PATCH 176/871] Bug fix. 
--- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 77813cb2..0cb84264 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -163,7 +163,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 5f81f7ca1c55f44ad1649bfe0ddb660a5402a88c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:55:50 +0200 Subject: [PATCH 177/871] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 0cb84264..977805c4 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -161,9 +161,10 @@ def process_node(self, node): edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). 
edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1\2', edep['deprel']) - edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 84e050203be61981fb4f482b32956d2545aa7859 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 10:21:32 +0200 Subject: [PATCH 178/871] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 977805c4..d6e99eed 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -153,6 +153,8 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: + # Although in theory allowed by the EUD guidelines, Russian does not enhance the ccomp relation with case markers. 
+ edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) if m: bdeprel = m.group(1) @@ -161,7 +163,6 @@ def process_node(self, node): edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) @@ -184,7 +185,7 @@ def process_node(self, node): for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|par|dat|acc|voc|loc|ins))?$', edep['deprel']) if m: edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True From 2220946afac8a01650e6e0292356fa3e6ff5463a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 10:34:36 +0200 Subject: [PATCH 179/871] Bug fix. 
--- udapi/block/ud/ru/fixedeprels.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index d6e99eed..6fa73460 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -76,10 +76,8 @@ class FixEdeprels(Block): 'выше': 'выше:gen', 'для': 'для:gen', 'для_в': 'для:gen', - 'до': 'до:gen', 'до_то_как': 'до:gen', # до того, как ... 'за_исключение': 'за_исключением:gen', - 'из': 'из:gen', 'из_более_чем': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', @@ -92,7 +90,6 @@ class FixEdeprels(Block): 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', - 'от': 'от:gen', 'от_до': 'от:gen', 'от_от': 'от:gen', 'от_с': 'от:gen', @@ -193,8 +190,16 @@ def process_node(self, node): if solved: continue # The following prepositions have more than one morphological case - # available. Thanks to the Case feature on prepositions, we can - # identify the correct one. + # available. + m = re.match(r'^(obl(?::arg)?|nmod):(до|из|от)(?::(?:nom|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or partitive are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue # Both "на" and "в" also occur with genitive. However, this # is only because there are numerals in the phrase ("в 9 случаев из 10") # and the whole phrase should not be analyzed as genitive. From b6a799c4e0bf82c8add59aa4cf3c64cfd6bedf5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 13 May 2022 11:03:04 +0200 Subject: [PATCH 180/871] A block to fix Russian "to est" from mark to cc. 
--- udapi/block/ud/ru/fixtoest.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 udapi/block/ud/ru/fixtoest.py diff --git a/udapi/block/ud/ru/fixtoest.py b/udapi/block/ud/ru/fixtoest.py new file mode 100644 index 00000000..1b603e96 --- /dev/null +++ b/udapi/block/ud/ru/fixtoest.py @@ -0,0 +1,35 @@ +"""Block to fix annotation of то есть in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixToEst(Block): + + def process_node(self, node): + """ + In the converted data from Kira, the fixed expression "то есть" ("that is") + is treated as a subordinator and attached as "mark", which later makes it + part of complex enhanced relation labels. I believe that this analysis is + wrong and that it will be better to label these expressions as "cc". + """ + if node.udeprel == 'mark' and node.lemma == 'то': + if len([c for c in node.children if c.udeprel == 'fixed' and c.lemma == 'быть']) > 0: + self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc') + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. 
+ ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From 4e97deca1849ab559ad3a2e12ca091fac56213df Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Wed, 18 May 2022 14:27:57 +0200 Subject: [PATCH 181/871] Support printing enhanced graphs in Tikz (#107) --- udapi/block/write/tikz.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 43417c61..40071739 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -39,7 +39,8 @@ class Tikz(BaseWriter): """ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, - attributes=None, as_tree=False, comment_attribute=None, **kwargs): + attributes=None, as_tree=False, comment_attribute=None, + enhanced=False, **kwargs): """Create the Tikz block object. Args: @@ -50,6 +51,7 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, attributes: comma-separated list of node attributes to print (each on a separate line). as_tree: boolean - should print it as a 2D tree? comment_attribute: which attribute to print as a string under each graph (e.g. text_en) + enhanced: boolean - print the enhanced graph below the sentence, too? 
""" super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -63,6 +65,9 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, self.node_attributes = 'form,upos'.split(',') self.as_tree = as_tree self.comment_attribute = comment_attribute + if as_tree and enhanced: + raise ValueError("The enhanced graph cannot be printed as a tree") + self.enhanced = enhanced def before_process_document(self, doc): super().before_process_document(doc) @@ -140,6 +145,12 @@ def process_tree(self, tree): print(r'\deproot{%d}{root}' % node.ord) else: print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.enhanced: + for dep in node.deps: + if dep['parent'].is_root(): + print(r'\deproot[edge below]{%d}{root}' % node.ord) + else: + print(r'\depedge[edge below]{%d}{%d}{%s}' % (dep['parent'].ord, node.ord, dep['deprel'])) if self.comment_attribute and tree.comment: start_pos = tree.comment.find(self.comment_attribute + ' = ') if start_pos != -1: From 3688e4cc0be8e324c6ba87c1b96ca2b9e5bd8b16 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 10:09:26 +0200 Subject: [PATCH 182/871] A block to check for Czech-specific bugs in feature values. --- udapi/block/ud/cs/markfeatsbugs.py | 217 +++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 udapi/block/ud/cs/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py new file mode 100644 index 00000000..b4b6ccfa --- /dev/null +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -0,0 +1,217 @@ +""" +Block to identify missing or ill-valued features in Czech. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. 
+""" +from udapi.core.block import Block +import logging +import re + +class MarkFeatsBugs(Block): + + allowed = { + 'NOUN': {'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}, + 'ADJ': {'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'NumType': ['Ord'], + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Tense': ['Pres', 'Past'], + 'Voice': ['Act', 'Pass'], + 'Foreign': ['Yes']} + } + + required = { + 'NOUN': ['Gender', 'Number', 'Case', 'Polarity'], + 'ADJ': ['Gender', 'Number', 'Case', 'Degree', 'Polarity'] + } + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. + for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. 
+ """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 
'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act', 'Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + 
else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act', 'Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: # regular adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 
'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) From 0116b9c71d603aa7e9111a598dc1fe8f0865a550 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 16:41:46 +0200 Subject: [PATCH 183/871] Rules for features in Czech. --- udapi/block/ud/cs/markfeatsbugs.py | 303 ++++++++++++++++++++++++++--- 1 file changed, 279 insertions(+), 24 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index b4b6ccfa..c71ccd0f 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -70,6 +70,7 @@ def check_required_features(self, node, required): self.bug(node, 'Feat' + f + 'Missing') def process_node(self, node): + # NOUNS ################################################################ if node.upos == 'NOUN': self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) if node.feats['Gender'] == 'Masc': @@ -88,6 +89,7 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Foreign': ['Yes']}) + # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) if node.feats['Gender'] == 'Masc': @@ -108,6 +110,7 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'], 'NameType': ['Giv', 'Sur', 'Geo'], 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### elif node.upos == 'ADJ': if node.feats['Poss'] == 'Yes': # possessive adjectives if node.feats['Gender'] == 'Masc': @@ -150,31 +153,61 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Foreign': ['Yes']}) elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, 
['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Act', 'Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Act', 'Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 
'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) @@ -215,3 +248,225 @@ def process_node(self, node): 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + self.check_required_features(node, ['PronType', 
'Person', 'Number', 'Case']) + if node.feats['Person'] == '3': + if re.match(r'^(Nom|Voc)$', node.feats['Case']): + self.check_required_features(node, ['Gender']) + # In PDT, animacy of personal pronouns is distinguished only for Person=3 Case=Nom Gender=Masc Number=Plur ('oni' vs. 'ony'). + # So we will neither require nor allow it in singular and dual. + if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Voc'] + }) + else: # on, ona, ono, ony (Fem Plur) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Voc'] + }) + else: # non-nominatives also have PrepCase + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jemu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... je + self.check_required_features(node, ['PrepCase']) + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['Gender']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc,Neut', 'Fem', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + # No gender in dual and plural: + # Plur ... 
jich, jim, je, nich, jimi + else: + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # PronType není Prs + # Int,Rel ... kdo, co + # Rel ... kdož, což, jenž, ješto, jenžto, an + # Ind ... něco + # Neg ... nic, nicož + # kdo, kdož, někdo, nikdo ... Gender=Masc, Animacy=Anim, Case (but no Number; it could be used in the context of any number) + # jehožto, něhožto, jemužto, němužto ... Gender=Masc,Neut (similarly to non-nominative forms of personal pronoun 'on') + ###!!! We could make the requirements more precise if we look at the lemma. + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Neut'], + 'Animacy': ['Anim'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. 
+ if re.match(r'^(jeho|jejich|jich|jehož|jejichž|jichž|jehožto|jejichžto|jichžto)$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc,Neut'] + }) + else: + # Gender is annotated in all cases in singular, but only in + # nominative, accusative (and theoretically vocative) in plural. + # Other cases (Gen, Dat, Loc, Ins) are gender-less: 'těch', 'svým', ... + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + # Same for animacy (which implies masculine gender). + self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2', '3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc,Neut', 'Fem'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word', 'Digit', 'Roman'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + self.check_required_features(node, ['Aspect', 'VerbForm']) + if node.feats['VerbForm'] == 'Inf': + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). + self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. 
+ if node.feats['Mood'] == 'Cnd': + self.check_required_features(node, ['Mood', 'Person']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': 
['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From a7c7b145b432a86cd12ac1303cc6daf69fa54a13 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 21:42:13 +0200 Subject: [PATCH 184/871] More precise requirements on features of Czech pronouns and determiners. --- udapi/block/ud/cs/markfeatsbugs.py | 170 ++++++++++++++++++++++++----- 1 file changed, 145 insertions(+), 25 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index c71ccd0f..a2c2bb7b 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -319,28 +319,118 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] }) - else: # PronType není Prs - # Int,Rel ... kdo, co - # Rel ... kdož, což, jenž, ješto, jenžto, an - # Ind ... něco - # Neg ... nic, nicož - # kdo, kdož, někdo, nikdo ... Gender=Masc, Animacy=Anim, Case (but no Number; it could be used in the context of any number) - # jehožto, něhožto, jemužto, němužto ... Gender=Masc,Neut (similarly to non-nominative forms of personal pronoun 'on') - ###!!! We could make the requirements more precise if we look at the lemma. 
- self.check_required_features(node, ['PronType', 'Case']) + elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. Někdo and nikdo behave like singular; + # kdo is by default singular as well but it also occurs as a subject + # of plural verbs. + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) self.check_allowed_features(node, { 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Neut'], + 'Gender': ['Masc'], 'Animacy': ['Anim'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + # Although these pronouns behave by default as neuter singular, + # no Gender and Number is annotated. However, quite unusually, + # there is Animacy=Inan without Gender. + ###!!! This should probably be fixed in all Czech treebanks and + ###!!! in Interset. The pronoun should get Gender=Neut and no + ###!!! animacy. For now, let's at least make animacy an optional + ###!!! feature (I see that we already do not fill it in the Old + ###!!! Czech data). + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Animacy': ['Inan'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif node.lemma == 'ješto': + # Unlike 'jenžto', this relative pronoun does not inflect, it + # always occurs in a nominative position, but the context can + # be any gender and number. 
+ self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Case': ['Nom'] }) + elif re.match(r'^(jenž|jenžto)$', node.lemma): + # The relative pronouns 'jenž', 'jenžto' inflect for gender; + # while we normally take this as a sign of DET (instead of PRON), + # these can never act as real DET because they never modify a + # nominal. + # Similarly to the personal pronoun 'on', animacy is only + # annotated for masculine nominative plural, non-nominative + # forms are merged for masculine and neuter (jehož, jemuž), and + # non-singular gender is only annotated in nominative (while + # these cases are common for all genders: jichž, jimž, jimiž). + # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even + # in the nominative, although there is no prepositional counter- + # part (but similarly the locative has no prepositionless form). + if node.feats['Case'] == 'Nom': + if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # not Masc Plur + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # not Case=Nom + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # non-nominative dual or 
plural: jichž, nichž, jimž, nimž, jež, něž, jimiž, nimiž + self.check_required_features(node, ['PronType', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. + if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) # DETERMINERS ########################################################## elif node.upos == 'DET': # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. 
- if re.match(r'^(jeho|jejich|jich|jehož|jejichž|jichž|jehožto|jejichžto|jichžto)$', node.form.lower()): + if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) self.check_allowed_features(node, { 'PronType': ['Prs', 'Rel'], @@ -349,21 +439,50 @@ def process_node(self, node): 'Number[psor]': ['Sing', 'Dual', 'Plur'], 'Gender[psor]': ['Masc,Neut'] }) - else: - # Gender is annotated in all cases in singular, but only in - # nominative, accusative (and theoretically vocative) in plural. - # Other cases (Gen, Dat, Loc, Ins) are gender-less: 'těch', 'svým', ... + elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + # Gender is annotated in all cases in singular (můj, má, mé) + # but only in nominative (and vocative) in plural (mí, mé, má); + # neuter is also different in accusative (mé, má). + # Animacy is distinguished only in nom/voc plural masculine (mí, mé). + # Other cases in plural are gender-less (mých, mým, mé, mými). # Note that this is not consistent with adjectives, where we # disambiguate gender in all cases in plural. - # Same for animacy (which implies masculine gender). 
- self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Reflex': ['Yes'], - 'Person': ['1', '2', '3'], - 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut', 'Fem'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative (and vocative) in plural (ti, ty, ta); + # neuter is also different in accusative (ty, ta). + # Animacy is distinguished only in nom/voc plural masculine (ti, ty). + # Other cases in plural are gender-less (těch, těm, ty, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], @@ -457,12 +576,13 @@ def process_node(self, node): # ADVERBS ############################################################## elif node.upos == 'ADV': self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg'], + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'] }) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) self.check_allowed_features(node, { 'AdpType': ['Prep', 'Voc'], 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] From 08311e094b7443b7719924530f25e7c7fcc849ef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 23:43:30 +0200 Subject: [PATCH 185/871] More detailed feature conditions based on PDT. --- udapi/block/ud/cs/markfeatsbugs.py | 232 ++++++++++++++++++++++------- 1 file changed, 179 insertions(+), 53 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index a2c2bb7b..e027d1cb 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -441,65 +441,180 @@ def process_node(self, node): }) elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing'], - 'Gender[psor]': ['Fem'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + # Congruent gender is annotated only in singular. 
Masculine and + # neuter are merged even in nominative. Feminine singular does + # not distinguish case in PDT but we need it in Old Czech at + # least for 'jejiej'. + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' # Gender is annotated in all cases in singular (můj, má, mé) - # but only in nominative (and vocative) in plural (mí, mé, má); - # neuter is also different in accusative (mé, má). - # Animacy is distinguished only in nom/voc plural masculine (mí, mé). - # Other cases in plural are gender-less (mých, mým, mé, mými). + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc mí, mé, má; Acc mé, má). Animacy is distinguished + # in plural if gender is distinguished and masculine; in + # singular it is distinguished only in accusative (mého, můj). + # Other cases in plural are gender-less (mých, mým, mými). # Note that this is not consistent with adjectives, where we # disambiguate gender in all cases in plural. 
- self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + }) + else: + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) + 
self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + }) else: # Gender is annotated in all cases in singular (ten, ta, to) - # but only in nominative (and vocative) in plural (ti, ty, ta); - # neuter is also different in accusative (ty, ta). - # Animacy is distinguished only in nom/voc plural masculine (ti, ty). - # Other cases in plural are gender-less (těch, těm, ty, těmi). + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). # Note that this is not consistent with adjectives, where we # disambiguate gender in all cases in plural. - self.check_required_features(node, ['PronType', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + }) + else: + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + }) + else: + self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + }) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word', 'Digit', 'Roman'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the 
paradigm - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + ###!!! Somehow the NumValue feature from PDT via Interset is useless. + # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. + if node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 
'NumValue': ['1,2,3'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): self.check_required_features(node, ['Aspect', 'VerbForm']) @@ -575,11 +690,22 @@ def process_node(self, node): }) # ADVERBS ############################################################## elif node.upos == 'ADV': - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] - }) + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': self.check_required_features(node, ['AdpType', 'Case']) From 9d8dc3b0568596a1d8ff16dc9f54504ea38a583a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 13:08:32 +0200 Subject: [PATCH 186/871] Fix spurious auxiliaries in Kazakh. 
--- udapi/block/ud/kk/fixspuriousaux.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 udapi/block/ud/kk/fixspuriousaux.py diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py new file mode 100644 index 00000000..8a1e06c8 --- /dev/null +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -0,0 +1,24 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Kazakh.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. + """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # баста = start + if re.match(r'^(баста)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node From f6cd84c05eb10f2acf47022afb6eaa2ff4195c10 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 13:11:34 +0200 Subject: [PATCH 187/871] Bug fix. --- udapi/block/ud/kk/fixspuriousaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py index 8a1e06c8..a2ba777c 100644 --- a/udapi/block/ud/kk/fixspuriousaux.py +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -22,3 +22,6 @@ def process_node(self, node): for c in lexverb.children: if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. 
+ lexverb.parent = node + lexverb.deprel = 'xcomp' From 0dbb2cf5d8897f5463dcc712049491d94206eddc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 13:20:04 +0200 Subject: [PATCH 188/871] =?UTF-8?q?=D0=BA=D0=B5=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/kk/fixspuriousaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py index a2ba777c..044ff178 100644 --- a/udapi/block/ud/kk/fixspuriousaux.py +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -12,7 +12,7 @@ def process_node(self, node): """ if node.upos == 'AUX' and node.udeprel == 'aux': # баста = start - if re.match(r'^(баста)$', node.lemma): + if re.match(r'^(баста|кет)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 59a1f6c57fc6d2913fc06e4f6495f105004c2bdc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 18:09:03 +0200 Subject: [PATCH 189/871] A block to remove multi-word token if it contains spaces and if its words match the space-delimited segments. --- udapi/block/ud/fixmwtspace.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 udapi/block/ud/fixmwtspace.py diff --git a/udapi/block/ud/fixmwtspace.py b/udapi/block/ud/fixmwtspace.py new file mode 100644 index 00000000..a2b7b875 --- /dev/null +++ b/udapi/block/ud/fixmwtspace.py @@ -0,0 +1,22 @@ +""" +Block ud.FixMwtSpace looks for multiword tokens whose form contains a space, +which should be avoided. If found, the block checks whether it can remove +the multiword token seamlessly, that is, whether the syntactic words correspond +to the space-delimited parts of the multiword token. If possible, the MWT +line will be removed. 
+""" +from udapi.core.block import Block +import re + + +class FixMwtSpace(Block): + """Try to remove multiword tokens with spaces.""" + + def process_node(self, node): + if node.multiword_token: + mwt = node.multiword_token + if re.search(r' ', mwt.form): + if node == mwt.words[0]: + wordforms = [x.form for x in mwt.words] + if ' '.join(wordforms) == mwt.form: + mwt.remove() From 29c48fa5a9002e37d14fe610d942586a06c9d328 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 16:51:48 +0200 Subject: [PATCH 190/871] Restore forms of words within multiword tokens. --- udapi/block/ud/mr/addformsinmwt.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 udapi/block/ud/mr/addformsinmwt.py diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py new file mode 100644 index 00000000..b468fb04 --- /dev/null +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -0,0 +1,27 @@ +""" +Block ud.mr.AddFormsInMwt looks for multiword tokens whose words lack forms. +Based on the form of the surface token and on the information provided in +the lemmas and UPOS, tries to reconstruct the forms of individual words. +""" +from udapi.core.block import Block +import re +import logging + + +class AddFormsInMwt(Block): + """Guess forms of syntactic worms within a multiword token.""" + + def process_node(self, node): + if node.form == '_' and node.multiword_token: + mwt = node.multiword_token + # Many multiword tokens consist of NOUN + ADP. Beware: The adposition + # may have a form different from its lemma. It happens with possessive + # postpositions चा, चे, which distinguish the gender and number of + # the possessed entity. + if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': + if mwt.form == mwt.words[0].lemma + mwt.words[1].lemma: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma for x in mwt.words]))) From d0e75c1f8fc89337b6ca87eb4b41b675a2eec3fd Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 16:59:38 +0200 Subject: [PATCH 191/871] Improved decomposition of X+ADP. --- udapi/block/ud/mr/addformsinmwt.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index b468fb04..26110fea 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -19,8 +19,12 @@ def process_node(self, node): # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': - if mwt.form == mwt.words[0].lemma + mwt.words[1].lemma: - node.form = node.lemma + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: From 788830b3fe3d1cd963498080b019bb7c675a6bef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:07:02 +0200 Subject: [PATCH 192/871] =?UTF-8?q?=E0=A4=9A=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 26110fea..be290c50 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -19,13 +19,23 @@ def process_node(self, node): # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': - m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) - if m: - if node == mwt.words[0]: - node.form = m.group(1) + if mwt.words[1].lemma == 'चा': + m = re.match(r'^(.+)(चा|चे)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = m.group(2) else: - node.form = node.lemma - else: - logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'ca' + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: - logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma for x in mwt.words]))) + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) From d5495edaf163f6e258147fee49f9f4bc8bc18fa0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:09:19 +0200 Subject: [PATCH 193/871] =?UTF-8?q?=E0=A4=9A=E0=A5=8D=E0=A4=AF=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index be290c50..5fcb9866 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -20,7 +20,7 @@ def process_node(self, node): # the possessed entity. if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': if mwt.words[1].lemma == 'चा': - m = re.match(r'^(.+)(चा|चे)$', mwt.form) + m = re.match(r'^(.+)(चा|चे|च्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From 20e33c965cc18a1ca4f6664323ef0e167978e5f8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:11:39 +0200 Subject: [PATCH 194/871] =?UTF-8?q?=E0=A4=9A=E0=A5=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 5fcb9866..903409f0 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -20,7 +20,7 @@ def process_node(self, node): # the possessed entity. 
if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': if mwt.words[1].lemma == 'चा': - m = re.match(r'^(.+)(चा|चे|च्या)$', mwt.form) + m = re.match(r'^(.+)(चा|चे|च्या|ची)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From fc5b262e7b65d8911a80f3409e28badd714f9b03 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:23:28 +0200 Subject: [PATCH 195/871] Particle "ca". --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 903409f0..44802762 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -18,7 +18,7 @@ def process_node(self, node): # may have a form different from its lemma. It happens with possessive # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. - if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': + if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): if mwt.words[1].lemma == 'चा': m = re.match(r'^(.+)(चा|चे|च्या|ची)$', mwt.form) if m: From 31e99b45980823b9bb3f0a2a82b6bd40872b0801 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:45:32 +0200 Subject: [PATCH 196/871] Possessive pronouns. --- udapi/block/ud/mr/addformsinmwt.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 44802762..7efe1e72 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -20,12 +20,25 @@ def process_node(self, node): # the possessed entity. if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): if mwt.words[1].lemma == 'चा': - m = re.match(r'^(.+)(चा|चे|च्या|ची)$', mwt.form) + # चा (cā) ... Masc Sing + # ची (cī) ... Fem Sing, Neut Plur + # चे (ce) ... Neut Sing, Masc Plur + # च्या (cyā) ... 
Fem Plur + m = re.match(r'^(.+)(चा|ची|चे|च्या)$', mwt.form) + # The resulting form is different with personal pronouns. + # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) + # तुझी (tujhī), तुझे (tujhe) + m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) else: node.form = m.group(2) + elif m2: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = 'च' + m.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: # not the possessive 'ca' From 3a33d279bd2e2b3a99c325d4921a314941217c98 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:46:23 +0200 Subject: [PATCH 197/871] Bug fix. --- udapi/block/ud/mr/addformsinmwt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 7efe1e72..e3fcafe5 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -36,9 +36,9 @@ def process_node(self, node): node.form = m.group(2) elif m2: if node == mwt.words[0]: - node.form = m.group(1) + node.form = m2.group(1) else: - node.form = 'च' + m.group(2) + node.form = 'च' + m2.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: # not the possessive 'ca' From a454721a0efbf19adfa2e973aaa03608052fc665 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:13:28 +0200 Subject: [PATCH 198/871] =?UTF-8?q?=E0=A4=9A=E0=A4=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index e3fcafe5..abf538d8 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -24,10 +24,12 @@ def process_node(self, node): # ची (cī) ... Fem Sing, Neut Plur # चे (ce) ... Neut Sing, Masc Plur # च्या (cyā) ... Fem Plur - m = re.match(r'^(.+)(चा|ची|चे|च्या)$', mwt.form) + # चं (caṁ) ... ? + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)$', mwt.form) # The resulting form is different with personal pronouns. # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) # तुझी (tujhī), तुझे (tujhe) + # त्याचं (tyācaṁ) m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: From 59fb2e823e9b34679781422ef9f8a73329d97c54 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:19:31 +0200 Subject: [PATCH 199/871] Fix wrong lemma. --- udapi/block/ud/mr/addformsinmwt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index abf538d8..7ecb64b4 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -19,7 +19,9 @@ def process_node(self, node): # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): - if mwt.words[1].lemma == 'चा': + # Occasionally the lemma of the possessive postposition is mistakenly 'ची' instead of 'चा'. 
+ if mwt.words[1].lemma == 'चा' or mwt.words[1].lemma == 'ची': + mwt.words[1].lemma = 'चा' # चा (cā) ... Masc Sing # ची (cī) ... Fem Sing, Neut Plur # चे (ce) ... Neut Sing, Masc Plur From a2077ea36ff859ecee9f89445e000c933969d50c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:30:12 +0200 Subject: [PATCH 200/871] Three-word tokens. --- udapi/block/ud/mr/addformsinmwt.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 7ecb64b4..b899b55c 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -54,5 +54,26 @@ def process_node(self, node): node.form = node.lemma else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): + # Compound postpositions where the middle word is the possessive 'चा'. + if mwt.words[1].lemma == 'चा': + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)(.+)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + elif node == mwt.words[1]: + node.form = m.group(2) + else: + node.form = m.group(3) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + elif node == mwt.words[1]: + node.form = 'च' + m2.group(2) + else: + node.form = m2.group(3) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." 
% (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) else: logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) From 3441331c8a4fa39c64651a6d8409007d61847aa8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:55:19 +0200 Subject: [PATCH 201/871] The honorific pronoun. --- udapi/block/ud/mr/addformsinmwt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index b899b55c..0ad7fded 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -31,8 +31,9 @@ def process_node(self, node): # The resulting form is different with personal pronouns. # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) # तुझी (tujhī), तुझे (tujhe) + # आपला (āpalā), आपली (āpalī), आपल्या (āpalyā) # त्याचं (tyācaṁ) - m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) @@ -58,7 +59,7 @@ def process_node(self, node): # Compound postpositions where the middle word is the possessive 'चा'. if mwt.words[1].lemma == 'चा': m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) - m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From 15cf8914cfd75a14eb88bf2ff6278ba7741637a7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:57:15 +0200 Subject: [PATCH 202/871] Bug fix. 
--- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 0ad7fded..f508076c 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -59,7 +59,7 @@ def process_node(self, node): # Compound postpositions where the middle word is the possessive 'चा'. if mwt.words[1].lemma == 'चा': m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) - m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From 940a2c6dff3895c5cad7473dcaa566a47c157390 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 19:53:17 +0200 Subject: [PATCH 203/871] -vara. --- udapi/block/ud/mr/addformsinmwt.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index f508076c..2df0b2e3 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -46,7 +46,16 @@ def process_node(self, node): node.form = 'च' + m2.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) - else: # not the possessive 'ca' + elif mwt.words[1].lemma == 'वरती': + m = re.match(r'^(.+)वर$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = 'वर' + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'चा' m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) if m: if node == mwt.words[0]: From 001e7f2cb531540633f4c667eb0b111f78709a0c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 19:54:49 +0200 Subject: [PATCH 204/871] =?UTF-8?q?+=20=E0=A4=A4=E0=A5=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 2df0b2e3..7077e665 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -47,12 +47,12 @@ def process_node(self, node): else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) elif mwt.words[1].lemma == 'वरती': - m = re.match(r'^(.+)वर$', mwt.form) + m = re.match(r'^(.+)(वर(?:ती)?)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) else: - node.form = 'वर' + node.form = m.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: # not the possessive 'चा' From d339c1030b6f8a6ad9675776f2795e6dfef88440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barbora=20Dohnalov=C3=A1?= <71558316+kybersutr@users.noreply.github.com> Date: Tue, 31 May 2022 17:23:04 +0200 Subject: [PATCH 205/871] Entity setter (#108) * add possibility to change the entity of a mention * keep mentions sorted * remove text * self instead of mention * add warning and tests Co-authored-by: Kybersutr --- udapi/core/coref.py | 8 ++++++-- udapi/core/tests/test_coref.py | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index edd297b4..3eb76db3 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -103,6 +103,7 @@ import collections.abc import copy import logging +import bisect @functools.total_ordering class CorefMention(object): @@ -192,9 +193,12 @@ def entity(self): @entity.setter def entity(self, new_entity): if self._entity is not None: - raise NotImplementedError('changing the entity of a mention not supported yet') + original_entity = self._entity + original_entity._mentions.remove(self) + if not original_entity._mentions: + logging.warning(f"Original entity {original_entity.eid} is now empty.") self._entity = new_entity - new_entity._mentions.append(new_entity) + bisect.insort(new_entity._mentions, self) @property def bridging(self): diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 369e8caf..e0998b75 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -48,6 +48,11 @@ def test_edits(self): self.assertEqual(new_entity.mentions[1], m1) self.assertTrue(m2 < m1) self.assertEqual(m2.words, [first_node, second_node, second_node.next_node]) + entity2 = doc.create_coref_entity() + m1.entity = entity2 + self.assertEqual(m1.entity.eid, entity2.eid) + m2.entity = entity2 + self.assertEqual(m2.entity.eid, entity2.eid) if __name__ 
== "__main__": From df5b371c7b1e17dbbb0b26cffe8f84243631c452 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Jun 2022 14:52:30 +0200 Subject: [PATCH 206/871] util.Eval empty_nodes=1 node='my code...' allow processing empty nodes with util.Eval --- udapi/block/util/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index 0f80d018..df6aaabf 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,7 +29,7 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - coref_mention=None, coref_entity=None, + coref_mention=None, coref_entity=None, empty_nodes=False, expand_code=True, **kwargs): super().__init__(**kwargs) self.doc = doc @@ -44,6 +44,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.after_bundle = after_bundle self.coref_mention = coref_mention self.coref_entity = coref_entity + self.empty_nodes = empty_nodes self.expand_code = expand_code self.count = collections.Counter() @@ -115,7 +116,8 @@ def process_tree(self, tree): exec(self.expand_eval_code(self.tree)) if self.node: - for node in tree.descendants(): + nodes = tree.descendants_and_empty if self.empty_nodes else tree.descendants + for node in nodes: this = node exec(self.expand_eval_code(self.node)) From 6e9320fbb2fdccdcfdfacc0c25d2659801b54ba6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 7 Jun 2022 10:16:59 +0200 Subject: [PATCH 207/871] Lemmatization of Cantonese. 
--- udapi/block/ud/yue/lemmatize.py | 43 +++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 udapi/block/ud/yue/lemmatize.py diff --git a/udapi/block/ud/yue/lemmatize.py b/udapi/block/ud/yue/lemmatize.py new file mode 100644 index 00000000..87279dc1 --- /dev/null +++ b/udapi/block/ud/yue/lemmatize.py @@ -0,0 +1,43 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + '𡃁仔': '笭仔', + '仲': '重', + '企': '徛', + '係咪': '係', + '出嚟': '出唻', + '可': '可以', + '啦': '喇', + '㗎喇': '㗎嘑', + '喇': '嘑', + '嚟': '唻', + '就嚟': '就唻', + '死𡃁妹': '死笭妹', + '老豆': '老頭', + '蚊': '緡', + '蛋撻': '蛋澾', + '返嚟': '返唻', + '過嚟人': '過唻人', + '過嚟': '過唻' + } + + def process_node(self, node): + """ + Parts of the Cantonese treebank lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + + For Cantonese, lemmatization includes normalization of some characters. + These are the few cases where lemma differs from the surface form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form From 9c26f877cb0200f7d52b64e4563c6cdd9cc5e09a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Jun 2022 16:48:13 +0200 Subject: [PATCH 208/871] sanity check: cannot compare nodes from different documents When `node1` and `node2` are from different documents, it makes no sense to use `node1.precedes(node2)`. Sorting nodes from different docs using `precedes` could result in unexpected bugs because with the default Python `sort` even nodes from the same document may not be in the correct order (the relation was not transitive without the added sanity check). 
--- udapi/core/node.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index ad36aa0a..63242698 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -692,6 +692,8 @@ def precedes(self, node): return self._ord < node._ord if self._root._zone != node._root._zone: raise ValueError(f"Cannot compare word order across zones: {self} {node}") + if self._root._bundle._document is not node._root._bundle._document: + raise ValueError(f"Cannot compare word order across documents: {self} {node}") return self._root._bundle.number < node._root._bundle.number def is_leaf(self): From 02cb25bba090e49dc32bd5f74161e3266740e500 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Jun 2022 01:00:03 +0200 Subject: [PATCH 209/871] 02-blocks.ipynb not finished yet --- tutorial/01-visualizing.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/01-visualizing.ipynb b/tutorial/01-visualizing.ipynb index 382bb11f..70bea240 100644 --- a/tutorial/01-visualizing.ipynb +++ b/tutorial/01-visualizing.ipynb @@ -526,7 +526,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the next tutorial, [02-blocks.ipynb](02-blocks.ipynb), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." + "In the next tutorial, 02-blocks.ipynb (not finished yet), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." 
] } ], From 5216c81c600fd280867e5fd101ef05a87658c26a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Jun 2022 01:00:51 +0200 Subject: [PATCH 210/871] 02-blocks.ipynb not finished yet --- tutorial/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/README.md b/tutorial/README.md index 05e96d59..425f7df5 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -6,4 +6,4 @@ Don't display the tutorial `ipynb` files on GitHub because it cannot render the If you don't have Jupyter installed, you can display the tutorial with https://nbviewer.jupyter.org, using the following links: - [01-visualizing.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb) -- [02-blocks.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-blocks.ipynb) +- 02-blocks.ipynb (not finished yet) From b193c2fb5d41c075fc21b194b4ddb5b88d42a6d6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:17:23 +0200 Subject: [PATCH 211/871] Fix spurious auxiliaries in Uyghur. --- udapi/block/ud/ug/fixspuriousaux.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 udapi/block/ud/ug/fixspuriousaux.py diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py new file mode 100644 index 00000000..2ac6adc2 --- /dev/null +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -0,0 +1,27 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Uyghur.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. 
+ """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # بەر = give (used with actions done for the benefit of somebody) + if re.match(r'^(بەر)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' From 42cdc02d3b50ecf4d70ed68d510c9d81635c41a5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:28:33 +0200 Subject: [PATCH 212/871] =?UTF-8?q?=DA=86=D9=89=D9=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 2ac6adc2..2f2d779c 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -12,7 +12,8 @@ def process_node(self, node): """ if node.upos == 'AUX' and node.udeprel == 'aux': # بەر = give (used with actions done for the benefit of somebody) - if re.match(r'^(بەر)$', node.lemma): + # چىق = go out + if re.match(r'^(بەر|چىق)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From 8a5e45f6df8f7a9770003c979180e66d274efc12 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:49:14 +0200 Subject: [PATCH 213/871] =?UTF-8?q?=D9=8A=DB=88=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 2f2d779c..b770edcf 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -13,7 +13,8 @@ def process_node(self, node): if node.upos == 'AUX' and node.udeprel == 'aux': # بەر = give (used with actions done for the benefit of somebody) # چىق = go out - if re.match(r'^(بەر|چىق)$', node.lemma): + # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + if re.match(r'^(بەر|چىق|يۈر)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From c580385d1147590bb5fc43bca925f001cd623a04 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:55:15 +0200 Subject: [PATCH 214/871] =?UTF-8?q?=D8=A6=D9=88=D9=84=D8=AA=DB=87=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index b770edcf..c03a0e5a 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -14,7 +14,8 @@ def process_node(self, node): # بەر = give (used with actions done for the benefit of somebody) # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) - if re.match(r'^(بەر|چىق|يۈر)$', node.lemma): + # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From 77024b3ee2b8648c8737dfcaecd64c1b08c78220 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:04:06 +0200 Subject: [PATCH 215/871] =?UTF-8?q?=D8=A8=D8=A7=D9=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index c03a0e5a..8eae5f19 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -15,7 +15,8 @@ def process_node(self, node): # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر)$', node.lemma): + # باق = to do ever? + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 8bc5af7019a62220f7480b9f13abccabff801ab6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:08:17 +0200 Subject: [PATCH 216/871] It does not work if we require upos=AUX. --- udapi/block/ud/ug/fixspuriousaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 8eae5f19..8ea1227e 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -10,7 +10,9 @@ def process_node(self, node): Some verbs that are called auxiliary by the traditional grammar, should be analyzed in UD as VERB + non-finite xcomp. """ - if node.upos == 'AUX' and node.udeprel == 'aux': + # Sometimes there is a double error: it should not be auxiliary, it is + # attached as aux but it is not tagged AUX. 
So we only look at the deprel. + if node.udeprel == 'aux': # بەر = give (used with actions done for the benefit of somebody) # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) From b679662a77d3f8735521a57f2190512a8f272a70 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:18:53 +0200 Subject: [PATCH 217/871] _ --- udapi/block/ud/ug/fixspuriousaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 8ea1227e..78cb86ec 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -18,7 +18,9 @@ def process_node(self, node): # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # باق = to do ever? - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق)$', node.lemma): + # ئۆت = pass + # _ ... some putative auxiliaries do not even have a lemma + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From 9003c324882539ba6201a7a18be22a08328d2c17 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:57:43 +0200 Subject: [PATCH 218/871] =?UTF-8?q?=D8=A8=D8=A7=D8=B4=D9=84=D9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 78cb86ec..da40074e 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -19,8 +19,10 @@ def process_node(self, node): # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # باق = to do ever? # ئۆت = pass + # كۆرۈش = see + # باشلى = start # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From d97891b9053ad9e2f7bd14337e43e579768688a0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:01:08 +0200 Subject: [PATCH 219/871] =?UTF-8?q?=D8=A8=D8=A7=D8=B4=D9=84=D9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index da40074e..4e620a2e 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -22,7 +22,7 @@ def process_node(self, node): # كۆرۈش = see # باشلى = start # _ ... 
some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From be9d3675600e7884ca77835412d392c8a6daf817 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:10:28 +0200 Subject: [PATCH 220/871] =?UTF-8?q?=D9=8A=DB=95=D8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 4e620a2e..7bc8f546 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -21,8 +21,9 @@ def process_node(self, node): # ئۆت = pass # كۆرۈش = see # باشلى = start + # يەت = be enough # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From dc347add2e736251b4822578dfc4566110e8b834 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:16:04 +0200 Subject: [PATCH 221/871] =?UTF-8?q?=D9=82=D8=A7=D9=8A=D8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 7bc8f546..4cc038dc 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -17,13 +17,14 @@ def process_node(self, node): # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) - # باق = to do ever? + # باق = do ever? # ئۆت = pass # كۆرۈش = see # باشلى = start # يەت = be enough + # قايت = return # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From a6880f6a483d7f129159cd5a1f48f88e9a03dcbf Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:21:52 +0200 Subject: [PATCH 222/871] =?UTF-8?q?=DA=86=DB=88=D8=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 4cc038dc..9ccff72c 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -23,8 +23,9 @@ def process_node(self, node): # باشلى = start # يەت = be enough # قايت = return + # چۈش = fall down # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 470cd271e3812591f78f7ddf0d651d4571c46428 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:31:27 +0200 Subject: [PATCH 223/871] =?UTF-8?q?=D8=A8=D8=A7=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 9ccff72c..bfbf8816 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -13,7 +13,7 @@ def process_node(self, node): # Sometimes there is a double error: it should not be auxiliary, it is # attached as aux but it is not tagged AUX. So we only look at the deprel. 
if node.udeprel == 'aux': - # بەر = give (used with actions done for the benefit of somebody) + # بەر/بار = give (used with actions done for the benefit of somebody) # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) @@ -25,7 +25,7 @@ def process_node(self, node): # قايت = return # چۈش = fall down # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): + if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 558bf49fe425f477671034eaef69c0bc67de8253 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:36:37 +0200 Subject: [PATCH 224/871] =?UTF-8?q?=D9=82=D9=89=D9=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index bfbf8816..dc06ade7 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -24,8 +24,9 @@ def process_node(self, node): # يەت = be enough # قايت = return # چۈش = fall down + # قىل = do # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): + if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From a9e39f6fe8c7dd2ef3f8d77d188b77f9ce0b073e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:49:11 +0200 Subject: [PATCH 225/871] =?UTF-8?q?=D9=83=D9=89=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index dc06ade7..952644f8 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -15,6 +15,7 @@ def process_node(self, node): if node.udeprel == 'aux': # بەر/بار = give (used with actions done for the benefit of somebody) # چىق = go out + # چىقىش = come out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # باق = do ever? @@ -25,8 +26,12 @@ def process_node(self, node): # قايت = return # چۈش = fall down # قىل = do + # چاپ = jump + # قورق = fear + # كەلتۈر = cause + # كىر = enter # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل)$', node.lemma): + if re.match(r'^(بەر|بار|چىق|چىقىش|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل|چاپ|قورق|كەلتۈر|كىر)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From 56b9dc693f1a928a719ae98eae253321bbdbf2a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 13 Jun 2022 20:37:38 +0200 Subject: [PATCH 226/871] even single-sentence docs should have `meta["loaded_from"]` --- udapi/core/basereader.py | 1 + udapi/core/tests/test_coref.py | 1 + 2 files changed, 2 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 9210b910..53a1129c 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -177,6 +177,7 @@ def process_document(self, document): if root.newdoc and root.newdoc is not True: document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity + document.meta['loaded_from'] = self.filename filehandle = self.filehandle if filehandle is None: diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index e0998b75..8952d6d8 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -21,6 +21,7 @@ def test_load(self): self.assertEqual(len(node.coref_entities), 1) self.assertEqual(len(node.coref_mentions), 1) self.assertEqual(node.coref_entities[0], coref_entities[0]) + self.assertEqual(docs[-1].meta["loaded_from"], data_filename) def test_edits(self): data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') From 3df722c2b9f63732efab41aa095a4922b98b088b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 14 Jun 2022 13:22:34 +0200 Subject: [PATCH 227/871] We no longer use a global list of required/allowed features per UPOS. 
--- udapi/block/ud/cs/markfeatsbugs.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index e027d1cb..d7854982 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -9,36 +9,6 @@ class MarkFeatsBugs(Block): - allowed = { - 'NOUN': {'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}, - 'ADJ': {'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Poss': ['Yes'], - 'Gender[psor]': ['Masc', 'Fem'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'NumType': ['Ord'], - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Tense': ['Pres', 'Past'], - 'Voice': ['Act', 'Pass'], - 'Foreign': ['Yes']} - } - - required = { - 'NOUN': ['Gender', 'Number', 'Case', 'Polarity'], - 'ADJ': ['Gender', 'Number', 'Case', 'Degree', 'Polarity'] - } - def bug(self, node, bugstring): bugs = [] if node.misc['Bug']: From 2d0425ac2deda455e42aadb88093171ba394d2d6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 14 Jun 2022 22:18:20 +0200 Subject: [PATCH 228/871] Better checking of required and allowed features in Czech. --- udapi/block/ud/cs/markfeatsbugs.py | 327 ++++++++++++----------------- 1 file changed, 135 insertions(+), 192 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index d7854982..b7b6c5b1 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -9,6 +9,18 @@ class MarkFeatsBugs(Block): + # The convention used in PDT is not consistent. 
Adjectives are fully disambiguated + # (three genders, two animacies, three numbers, seven cases), even though some + # forms are shared among many feature combinations. On the other hand, pronouns + # and determiners omit some features in the context of certain values of other + # features (e.g., gender and animacy are not distinguished in plural if the case + # is genitive, dative, locative or instrumental). + # In contrast, ČNK (CNC) fully disambiguates pronouns and determiners just like + # adjectives. + # Here we can trigger one of the two conventions. It should become a block parameter + # in the future. + pdt20 = False # True = like in PDT 2.0; False = like in ČNK + def bug(self, node, bugstring): bugs = [] if node.misc['Bug']: @@ -231,60 +243,29 @@ def process_node(self, node): 'Variant': ['Short'] }) else: # not reflexive - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - if node.feats['Person'] == '3': - if re.match(r'^(Nom|Voc)$', node.feats['Case']): - self.check_required_features(node, ['Gender']) - # In PDT, animacy of personal pronouns is distinguished only for Person=3 Case=Nom Gender=Masc Number=Plur ('oni' vs. 'ony'). - # So we will neither require nor allow it in singular and dual. 
- if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Voc'] - }) - else: # on, ona, ono, ony (Fem Plur) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Voc'] - }) - else: # non-nominatives also have PrepCase + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně # Mostly only two gender groups and no animacy: # Masc,Neut ... jeho, jemu, jej, něm, jím # Fem ... jí, ji, ní # Neut ... je - self.check_required_features(node, ['PrepCase']) - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['Gender']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc,Neut', 'Fem', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) # No gender in dual and plural: # Plur ... 
jich, jim, je, nich, jimi - else: - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # 1st and 2nd person do not have gender + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) self.check_allowed_features(node, { 'PronType': ['Prs'], - 'Person': ['1', '2', '3'], + 'Person': ['1', '2'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] @@ -337,44 +318,10 @@ def process_node(self, node): # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even # in the nominative, although there is no prepositional counter- # part (but similarly the locative has no prepositionless form). 
- if node.feats['Case'] == 'Nom': - if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': - self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Plur'], - 'Case': ['Nom'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # not Masc Plur - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # not Case=Nom - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # non-nominative dual or plural: jichž, nichž, jimž, nimž, jež, něž, jimiž, nimiž - self.check_required_features(node, ['PronType', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) else: # What remains is the relative pronoun 'an'. It behaves similarly # to 'jenž' but it does not have the PrepCase feature and it @@ -439,107 +386,15 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - # Gender is annotated in all cases in singular (můj, má, mé) - # but only in nominative, accusative, and vocative in plural - # (Nom/Voc mí, mé, má; Acc mé, má). 
Animacy is distinguished - # in plural if gender is distinguished and masculine; in - # singular it is distinguished only in accusative (mého, můj). - # Other cases in plural are gender-less (mých, mým, mými). - # Note that this is not consistent with adjectives, where we - # disambiguate gender in all cases in plural. - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) - if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing'], - 'Case': ['Acc'] - }) - else: - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): - self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Acc', 'Voc'] - }) - else: - self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # 
only if not reflexive - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] - }) + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'] # only if not reflexive + }) else: - # Gender is annotated in all cases in singular (ten, ta, to) - # but only in nominative, accusative, and vocative in plural - # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished - # in plural if gender is distinguished and masculine; in - # singular it is distinguished only in accusative (toho, ten). - # Other cases in plural are gender-less (těch, těm, těmi). - # Note that this is not consistent with adjectives, where we - # disambiguate gender in all cases in plural. - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing'], - 'Case': ['Acc'] - }) - else: - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Acc', 'Voc'] - }) - else: - self.check_required_features(node, ['PronType', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] - }) + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -568,14 +423,25 @@ def process_node(self, node): }) elif re.match(r'^(dva|oba)$', node.lemma): self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': 
['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { @@ -686,3 +552,80 @@ def process_node(self, node): # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_featurs = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. + if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). 
Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) From 84334251ee1954fb42ec8cb570a38800262b68d7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 15 Jun 2022 10:12:55 +0200 Subject: [PATCH 229/871] Distinguish reflexive and irreflexive possessives. 
--- udapi/block/ud/cs/markfeatsbugs.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index b7b6c5b1..78acc6f8 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -386,13 +386,19 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_adjective_like(node, ['PronType', 'Poss'], { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'] # only if not reflexive - }) + if node.feats['Reflex'] == 'Yes': + self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) else: self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) # NUMERALS ############################################################# From 4facb6d760aa0e0aaacf55f4b699076bf39b2cb8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 15 Jun 2022 14:26:18 +0200 Subject: [PATCH 230/871] Animacy only for masculine gender, also for participles. 
--- udapi/block/ud/cs/markfeatsbugs.py | 36 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 78acc6f8..11ecd6d9 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -508,24 +508,36 @@ def process_node(self, node): 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist }) elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Polarity': ['Pos', 'Neg'] - }) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) else: # converb self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 
'Perf'], 'VerbForm': ['Conv'], 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Voice': ['Act'], 'Number': ['Sing', 'Dual', 'Plur'], 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy 'Polarity': ['Pos', 'Neg'] From 3feff237f673d9a65434af5241e3cb317b6c4820 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Jun 2022 11:24:49 +0200 Subject: [PATCH 231/871] more Windows friendly --- udapi/core/resource.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/core/resource.py b/udapi/core/resource.py index 9e5923f1..ae7320c6 100644 --- a/udapi/core/resource.py +++ b/udapi/core/resource.py @@ -12,7 +12,9 @@ def require_file(path): raise IOError(path + " does not exist") return os.path.abspath(path) udapi_data = os.environ.get('UDAPI_DATA', os.environ.get('HOME')) - full_path = udapi_data + '/' + path + if udapi_data is None: + raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={os.environ.get('HOME')}") + full_path = os.path.join(udapi_data, path) if not os.path.isfile(full_path): logging.info('Downloading %s to %s', BASEURL + path, full_path) os.makedirs(os.path.dirname(full_path), exist_ok=True) From 4178f18d5d7527c23eea23126d8d93de0087208c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Jun 2022 01:51:32 +0200 Subject: [PATCH 232/871] os.environ.get('HOME') does not work on Windows --- udapi/core/resource.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/core/resource.py b/udapi/core/resource.py index ae7320c6..da2ba561 100644 --- a/udapi/core/resource.py +++ b/udapi/core/resource.py @@ -2,6 +2,7 @@ import logging import urllib.request import os +from os.path import expanduser BASEURL = 'http://ufallab.ms.mff.cuni.cz/tectomt/share/data/' @@ -11,9 +12,9 @@ def require_file(path): if not os.path.isfile(path): raise IOError(path + " does not exist") return 
os.path.abspath(path) - udapi_data = os.environ.get('UDAPI_DATA', os.environ.get('HOME')) + udapi_data = os.environ.get('UDAPI_DATA', expanduser('~')) if udapi_data is None: - raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={os.environ.get('HOME')}") + raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={expanduser('~')}") full_path = os.path.join(udapi_data, path) if not os.path.isfile(full_path): logging.info('Downloading %s to %s', BASEURL + path, full_path) From b3f5dca4c0dfbe3e4f2fb6850a37fd41c745a0dd Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Jul 2022 09:31:14 +0200 Subject: [PATCH 233/871] Catch more MWTs in Marathi. --- udapi/block/ud/mr/addformsinmwt.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 7077e665..bd63ee7d 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -66,7 +66,8 @@ def process_node(self, node): logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): # Compound postpositions where the middle word is the possessive 'चा'. - if mwt.words[1].lemma == 'चा': + # The lemma of the middle word should be 'चा' but sometimes it is 'च्या'. 
+ if re.match(r'^(चा|च्या)$', mwt.words[1].lemma): m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) if m: @@ -74,6 +75,7 @@ def process_node(self, node): node.form = m.group(1) elif node == mwt.words[1]: node.form = m.group(2) + node.lemma = 'चा' else: node.form = m.group(3) elif m2: @@ -81,9 +83,12 @@ def process_node(self, node): node.form = m2.group(1) elif node == mwt.words[1]: node.form = 'च' + m2.group(2) + node.lemma = 'चा' else: node.form = m2.group(3) else: logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) else: logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) From db6ae9ba76de22af008021db6774e9e1c0db26ab Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 11:09:48 +0200 Subject: [PATCH 234/871] Added a block to fix certain instances of duplicate subjects in Danish. --- udapi/block/ud/da/fixmultisubject.py | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 udapi/block/ud/da/fixmultisubject.py diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py new file mode 100644 index 00000000..7307f0cf --- /dev/null +++ b/udapi/block/ud/da/fixmultisubject.py @@ -0,0 +1,56 @@ +""" +Block ud.da.FixMultiSubject tries to fix some systemic instances of predicates +that have more than one subject dependent. 
+""" +from udapi.core.block import Block + +class FixMultiSubject(Block): + """ + Make sure that a predicate has at most one subject. Note that it can + only fix instances that follow certain pattern observed in the Danish + data. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r'^[nc]subj$', x.udeprel)] + if len(subjects) > 1: + # Pattern 1: A node is is attached as xcomp to the current node, and + # one of the subjects is closer to that xcomp than to the current + # node. + xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + if len(subjects) == 2 and len(xcompchildren) == 1: + xcompnode = xcompchildren[0] + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + # Is the second subject closer to xcomp than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to xcomp? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. 
+ for c in node.children: + if dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + +def dist(x, y): + d = x.ord - y.ord + if d < 0: + d = -d + return d From 4c844704dbe939bc83096950e5bdfa839174b021 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 11:11:33 +0200 Subject: [PATCH 235/871] Bug fix. --- udapi/block/ud/da/fixmultisubject.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 7307f0cf..a6709718 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -3,6 +3,7 @@ that have more than one subject dependent. """ from udapi.core.block import Block +import re class FixMultiSubject(Block): """ From a3a8b808b93ecbbd01f616b37f31a046b417b945 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 11:13:25 +0200 Subject: [PATCH 236/871] Bug fix. --- udapi/block/ud/da/fixmultisubject.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index a6709718..90ab6b7b 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -30,7 +30,7 @@ def process_node(self, node): subjects[0].parent = xcompnode # There are typically other dependents that should belong to the xcomp node. 
for c in node.children: - if dist(xcompnode, c) < dist(node, c): + if c != xcompnode and dist(xcompnode, c) < dist(node, c): c.parent = xcompnode # The xcompnode should probably be attached as something else # than xcomp, perhaps even the direction of the relation should @@ -43,7 +43,7 @@ def process_node(self, node): subjects[1].parent = xcompnode # There are typically other dependents that should belong to the xcomp node. for c in node.children: - if dist(xcompnode, c) < dist(node, c): + if c != xcompnode and dist(xcompnode, c) < dist(node, c): c.parent = xcompnode # The xcompnode should probably be attached as something else # than xcomp, perhaps even the direction of the relation should From 575440891e7a4e49de2364b7524e7f6b46b3e95b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 13:12:53 +0200 Subject: [PATCH 237/871] Another pattern for duplicite subjects in Danish. --- udapi/block/ud/da/fixmultisubject.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 90ab6b7b..37fe5f13 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -19,6 +19,9 @@ def process_node(self, node): # one of the subjects is closer to that xcomp than to the current # node. xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and + # possibly not so many other mis-attached dependents. + advclchildren = [x for x in node.children if x.udeprel == 'advcl'] if len(subjects) == 2 and len(xcompchildren) == 1: xcompnode = xcompchildren[0] dn = [dist(node, x) for x in subjects] @@ -49,6 +52,20 @@ def process_node(self, node): # than xcomp, perhaps even the direction of the relation should # be reversed, but one would have to resolve this manually. 
xcompnode.misc['ToDo'] = 'check-xcomp' + elif len(subjects) == 2 and len(advclchildren) == 1: + advclnode = advclchildren[0] + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to advcl than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to advcl? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = advclnode + # Is the second subject closer to advcl than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to advcl? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = advclnode def dist(x, y): d = x.ord - y.ord From d643b8d05eb698e1443b84321f2fbad4a99291ff Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 13:14:31 +0200 Subject: [PATCH 238/871] Bug fix. --- udapi/block/ud/da/fixmultisubject.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 37fe5f13..142a51d9 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -55,7 +55,7 @@ def process_node(self, node): elif len(subjects) == 2 and len(advclchildren) == 1: advclnode = advclchildren[0] dn = [dist(node, x) for x in subjects] - dx = [dist(xcompnode, x) for x in subjects] + dx = [dist(advclnode, x) for x in subjects] # Is the first subject closer to advcl than it is to the current node? # At the same time, is the second subject closer to the current node than it is to advcl? if dx[0] < dn[0] and dn[1] < dx[1]: From 7ca292f2282c6627eff720316707f2ab66b0b519 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 13:19:53 +0200 Subject: [PATCH 239/871] Improvement: The problematic predicate can now have multiple advcl dependents. 
--- udapi/block/ud/da/fixmultisubject.py | 30 +++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 142a51d9..453bc1c0 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -52,20 +52,22 @@ def process_node(self, node): # than xcomp, perhaps even the direction of the relation should # be reversed, but one would have to resolve this manually. xcompnode.misc['ToDo'] = 'check-xcomp' - elif len(subjects) == 2 and len(advclchildren) == 1: - advclnode = advclchildren[0] - dn = [dist(node, x) for x in subjects] - dx = [dist(advclnode, x) for x in subjects] - # Is the first subject closer to advcl than it is to the current node? - # At the same time, is the second subject closer to the current node than it is to advcl? - if dx[0] < dn[0] and dn[1] < dx[1]: - # The first subject should be re-attached to the advcl node. - subjects[0].parent = advclnode - # Is the second subject closer to advcl than it is to the current node? - # At the same time, is the first subject closer to the current node than it is to advcl? - elif dx[1] < dn[1] and dn[0] < dx[0]: - # The second subject should be re-attached to the xcomp node. - subjects[1].parent = advclnode + elif len(subjects) == 2 and len(advclchildren) > 0: + for advclnode in advclchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(advclnode, x) for x in subjects] + # Is the first subject closer to advcl than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to advcl? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = advclnode + break + # Is the second subject closer to advcl than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to advcl? 
+ elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = advclnode + break def dist(x, y): d = x.ord - y.ord From 551f2ed23bbec82bcbed9c7780a07f1a2e65f02c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:01:27 +0200 Subject: [PATCH 240/871] A more lenient approach to catching the pattern. --- udapi/block/ud/da/fixmultisubject.py | 62 ++++++++++++++-------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 453bc1c0..401c054a 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -22,36 +22,38 @@ def process_node(self, node): # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and # possibly not so many other mis-attached dependents. advclchildren = [x for x in node.children if x.udeprel == 'advcl'] - if len(subjects) == 2 and len(xcompchildren) == 1: - xcompnode = xcompchildren[0] - dn = [dist(node, x) for x in subjects] - dx = [dist(xcompnode, x) for x in subjects] - # Is the first subject closer to xcomp than it is to the current node? - # At the same time, is the second subject closer to the current node than it is to xcomp? - if dx[0] < dn[0] and dn[1] < dx[1]: - # The first subject should be re-attached to the xcomp node. - subjects[0].parent = xcompnode - # There are typically other dependents that should belong to the xcomp node. - for c in node.children: - if c != xcompnode and dist(xcompnode, c) < dist(node, c): - c.parent = xcompnode - # The xcompnode should probably be attached as something else - # than xcomp, perhaps even the direction of the relation should - # be reversed, but one would have to resolve this manually. - xcompnode.misc['ToDo'] = 'check-xcomp' - # Is the second subject closer to xcomp than it is to the current node? 
- # At the same time, is the first subject closer to the current node than it is to xcomp? - elif dx[1] < dn[1] and dn[0] < dx[0]: - # The second subject should be re-attached to the xcomp node. - subjects[1].parent = xcompnode - # There are typically other dependents that should belong to the xcomp node. - for c in node.children: - if c != xcompnode and dist(xcompnode, c) < dist(node, c): - c.parent = xcompnode - # The xcompnode should probably be attached as something else - # than xcomp, perhaps even the direction of the relation should - # be reversed, but one would have to resolve this manually. - xcompnode.misc['ToDo'] = 'check-xcomp' + if len(subjects) == 2 and len(xcompchildren) > 0: + for xcompnode in xcompchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] <= dn[0] and dn[1] <= dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + break + # Is the second subject closer to xcomp than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to xcomp? + elif dx[1] <= dn[1] and dn[0] <= dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. 
+ for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + break elif len(subjects) == 2 and len(advclchildren) > 0: for advclnode in advclchildren: dn = [dist(node, x) for x in subjects] From 1f48111035d1cda1ec678af3ca4574087b11023d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:10:35 +0200 Subject: [PATCH 241/871] Taking commas into account when assessing node distance. --- udapi/block/ud/da/fixmultisubject.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 401c054a..a690d0ba 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -72,7 +72,20 @@ def process_node(self, node): break def dist(x, y): - d = x.ord - y.ord - if d < 0: - d = -d + if x.ord < y.ord: + a = x + b = y + else: + a = y + b = x + d = b.ord - a.ord + # Count the commas between the two nodes. A comma should be seen as increasing + # the distance of the nodes, that is, decreasing the probability that they + # are in the same clause. + nc = 0 + for i in a.root.descendants: + if i.ord > a.ord and i.ord < b.ord: + if i.form == ',': + nc += 1 + d += nc * 10 return d From a430ac7a458a7c169b6fd0ae2b996c4bb20d9db0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:22:54 +0200 Subject: [PATCH 242/871] New pattern: "amod" with a copula. 
--- udapi/block/ud/da/fixmultisubject.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index a690d0ba..8bd9333a 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -22,6 +22,10 @@ def process_node(self, node): # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and # possibly not so many other mis-attached dependents. advclchildren = [x for x in node.children if x.udeprel == 'advcl'] + # Pattern 3: Instead of xcomp or advcl, there is a simple amod + # (under a verb!), in fact an adjective with a copula that should + # have been advcl. + amodchildren = [x for x in node.children if x.udeprel == 'amod'] if len(subjects) == 2 and len(xcompchildren) > 0: for xcompnode in xcompchildren: dn = [dist(node, x) for x in subjects] @@ -70,6 +74,32 @@ def process_node(self, node): # The second subject should be re-attached to the xcomp node. subjects[1].parent = advclnode break + elif len(subjects) == 2 and len(amodchildren) > 0: + for amodnode in amodchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(amodnode, x) for x in subjects] + # Is the first subject closer to amod than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to amod? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + # Is the second subject closer to amod than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to amod? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. 
+ subjects[1].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break def dist(x, y): if x.ord < y.ord: From 8589f3b64bcc070030a243186027354fe96682b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:28:46 +0200 Subject: [PATCH 243/871] Nouns with copula are "obl" not "amod". --- udapi/block/ud/da/fixmultisubject.py | 54 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 8bd9333a..e9367d46 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -24,8 +24,9 @@ def process_node(self, node): advclchildren = [x for x in node.children if x.udeprel == 'advcl'] # Pattern 3: Instead of xcomp or advcl, there is a simple amod # (under a verb!), in fact an adjective with a copula that should - # have been advcl. - amodchildren = [x for x in node.children if x.udeprel == 'amod'] + # have been advcl. Alternatively, the nonverbal clause is headed + # by a noun, and the deprel is obl instead of amod. + amodchildren = [x for x in node.children if re.match(r'^(amod|obl)$', x.udeprel)] if len(subjects) == 2 and len(xcompchildren) > 0: for xcompnode in xcompchildren: dn = [dist(node, x) for x in subjects] @@ -76,30 +77,31 @@ def process_node(self, node): break elif len(subjects) == 2 and len(amodchildren) > 0: for amodnode in amodchildren: - dn = [dist(node, x) for x in subjects] - dx = [dist(amodnode, x) for x in subjects] - # Is the first subject closer to amod than it is to the current node? - # At the same time, is the second subject closer to the current node than it is to amod? - if dx[0] < dn[0] and dn[1] < dx[1]: - # The first subject should be re-attached to the advcl node. 
- subjects[0].parent = amodnode - amodnode.deprel = 'advcl' - # There are typically other dependents that should belong to the amod node. - for c in node.children: - if c != amodnode and dist(amodnode, c) < dist(node, c): - c.parent = amodnode - break - # Is the second subject closer to amod than it is to the current node? - # At the same time, is the first subject closer to the current node than it is to amod? - elif dx[1] < dn[1] and dn[0] < dx[0]: - # The second subject should be re-attached to the xcomp node. - subjects[1].parent = amodnode - amodnode.deprel = 'advcl' - # There are typically other dependents that should belong to the amod node. - for c in node.children: - if c != amodnode and dist(amodnode, c) < dist(node, c): - c.parent = amodnode - break + if len([x for x in amodnode.children if x.udeprel == 'cop']) > 0: + dn = [dist(node, x) for x in subjects] + dx = [dist(amodnode, x) for x in subjects] + # Is the first subject closer to amod than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to amod? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + # Is the second subject closer to amod than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to amod? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. 
+ for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break def dist(x, y): if x.ord < y.ord: From dcc759cebef6d2b2343f64a65524b60e182a0570 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 1 Aug 2022 01:33:22 +0200 Subject: [PATCH 244/871] bugfix in create_block _import_blocks now returns pairs (name, instance), but we need just the instance --- udapi/core/basewriter.py | 2 +- udapi/core/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cc72c6e7..cdc2c38f 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -66,7 +66,7 @@ def before_process_document(self, document): logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: - logging.warning('overwrite=1 but documet.meta["loaded_from"] is None') + logging.warning('overwrite=1 but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: diff --git a/udapi/core/run.py b/udapi/core/run.py index c3a4ca6f..a0cc4a9a 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -176,4 +176,4 @@ def scenario_string(self): def create_block(block, **kwargs): """A factory function for creating new block instances (handy for IPython).""" blocks = _import_blocks([block], [kwargs]) - return blocks[0] + return blocks[0][1] From d08d549ff8b04abbd3645bb256a0279f97b1b625 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 16 Aug 2022 22:44:42 +0200 Subject: [PATCH 245/871] Spurious auxiliaries in Hindi and Urdu. 
--- udapi/block/ud/hi/fixaux.py | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 udapi/block/ud/hi/fixaux.py diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py new file mode 100644 index 00000000..c561d4ce --- /dev/null +++ b/udapi/block/ud/hi/fixaux.py @@ -0,0 +1,45 @@ +""" +Block to fix annotation of verbs that are currently treated as auxiliaries +but they should be treated as normal verbs instead. +""" +from udapi.core.block import Block +import logging +import re + +class FixAux(Block): + + def process_node(self, node): + # The following verbs appear in verb-verb compounds as the semantically + # less salient element: le (to take), de (to give), ḍāla (to throw), + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There + # are also jā (to go) and paṛa (to fall) but we do not list them here + # because they can also act as genuine auxiliaries. + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] + urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] + recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + hiphase = ['लग', 'चुक'] + urphase = ['لگ', 'چک'] + rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' + if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + node.deprel = 'compound' + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. 
+ # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node From 562f840ac00be75f48fbd52f6b6dab0178af13e7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 11:33:37 +0200 Subject: [PATCH 246/871] Fix lemma before fixing auxiliary. --- udapi/block/ud/hi/fixaux.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index c561d4ce..54a9bd83 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -9,6 +9,7 @@ class FixAux(Block): def process_node(self, node): + self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There @@ -17,6 +18,7 @@ def process_node(self, node): hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + # Control and raising verbs. hiphase = ['लग', 'चुक'] urphase = ['لگ', 'چک'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' @@ -43,3 +45,15 @@ def process_node(self, node): # occurs with pseudocopulas: "I declare him handsome." if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): c.parent = node + + def fix_lemma(self, node): + """ + Some verbal forms have wrong lemmas in the Hindi/Urdu treebanks. 
If they + are tagged AUX, it means that either the validator fails to recognize a + correct auxiliary, or we fail here to recognize a spurious auxiliary that + must be fixed. + """ + if node.upos == 'AUX': + # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" + if node.lemma == 'لگا': + node.lemma = 'لگ' From 1a8e7c721fe67f9fcdee86938e65ef185e745c1c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 11:46:53 +0200 Subject: [PATCH 247/871] =?UTF-8?q?=DA=86=D8=A7=DB=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 54a9bd83..f6507840 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -54,6 +54,12 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # چاہ is a wrong lemmatization of چاہتی, which is a wrong spelling of چاہیئے (cāhie) "should" + if node.lemma == 'چاہ': + node.lemma = 'چاہیئے' + if node.form == 'چاہتی': + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = 'چاہیئے' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From 7aae1ba2f90630e1fcf84c2c33208c61ed1f726d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 11:58:16 +0200 Subject: [PATCH 248/871] =?UTF-8?q?=DA=86=D8=A7=DB=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f6507840..e390dc72 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -19,8 +19,10 @@ def process_node(self, node): urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and 
raising verbs. - hiphase = ['लग', 'चुक'] - urphase = ['لگ', 'چک'] + # چاہنا चाहना (cāhnā) "to want, to wish" is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) "should, ought to" (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + hiphase = ['लग', 'चुक', 'चाह'] + urphase = ['لگ', 'چک', 'چاہ'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': node.deprel = 'compound' @@ -54,12 +56,6 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': - # چاہ is a wrong lemmatization of چاہتی, which is a wrong spelling of چاہیئے (cāhie) "should" - if node.lemma == 'چاہ': - node.lemma = 'چاہیئے' - if node.form == 'چاہتی': - node.feats['Typo'] = 'Yes' - node.misc['CorrectForm'] = 'چاہیئے' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From 47a7ef38c2c4c957b000743e8eed37681debf0f1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 12:09:53 +0200 Subject: [PATCH 249/871] =?UTF-8?q?=E0=A4=A6=E0=A4=BF=E0=A4=96=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e390dc72..f4cce0e4 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -19,10 +19,11 @@ def process_node(self, node): urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. - # چاہنا चाहना (cāhnā) "to want, to wish" is a control verb but not an auxiliary. - # Its form چاہیئے (cāhie) "should, ought to" (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. 
- hiphase = ['लग', 'चुक', 'चाह'] - urphase = ['لگ', 'چک', 'چاہ'] + # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + # دکھانا दिखाना (dikhānā) “to show” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': node.deprel = 'compound' From 5aff16b435f5baa0ecb1bb830fa57ae6e980c109 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 12:22:58 +0200 Subject: [PATCH 250/871] ... kar ke --- udapi/block/ud/hi/fixaux.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f4cce0e4..a1033d32 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -60,3 +60,8 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' + # The postposition ke after a verbal stem is not an auxiliary. + # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” + if node.lemma == 'کا' and node.form == 'کے': + node.upos = 'ADP' + node.deprel = 'mark' From 8acbf13ece8b280aee1ec785f61cf96f93fc3918 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 13:16:00 +0200 Subject: [PATCH 251/871] =?UTF-8?q?=DA=86=D8=A7=DB=81=D8=A6=DB=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index a1033d32..8f484546 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -57,6 +57,9 @@ def fix_lemma(self, node): must be fixed. 
""" if node.upos == 'AUX': + # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) + if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': + node.lemma = 'چاہئے' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From 2db0c9e11cb9eba23aacbe9142ae7123851a44fc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 13:26:54 +0200 Subject: [PATCH 252/871] Plural of cahie. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 8f484546..4967e9f9 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -60,6 +60,9 @@ def fix_lemma(self, node): # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': node.lemma = 'چاہئے' + if node.form == 'چاہئیں': + node.lemma = 'چاہئے' + node.feats['Number'] = 'Plur' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From d7241a18b7b5d866bb7a72df95be7dcf6deb8bd5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 13:35:03 +0200 Subject: [PATCH 253/871] =?UTF-8?q?=DA=AF=DB=8C=D8=A7=20is=20a=20perfectiv?= =?UTF-8?q?e=20participle=20of=20=D8=AC=D8=A7=D9=86=D8=A7=E2=80=8E=20(j?= =?UTF-8?q?=C4=81n=C4=81)=20=E2=80=9Cto=20go=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 4967e9f9..5fa11356 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -63,7 +63,10 @@ def fix_lemma(self, node): if node.form == 'چاہئیں': node.lemma = 'چاہئے' node.feats['Number'] = 
'Plur' - # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" + # گیا is a perfective participle of جانا‎ (jānā) “to go” + if node.lemma == 'گیا': + node.lemma = 'جا' + # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' # The postposition ke after a verbal stem is not an auxiliary. From 027fe04bb6f4195070fc536d3a26770b1b38197a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 14:50:00 +0200 Subject: [PATCH 254/871] =?UTF-8?q?Urdu=20pseudo-auxiliary=20=D9=BE=DA=BE?= =?UTF-8?q?=DB=8C=D9=86=DA=A9=20(phenk).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 5fa11356..a04ed04f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -11,12 +11,12 @@ class FixAux(Block): def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically - # less salient element: le (to take), de (to give), ḍāla (to throw), + # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There # are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. 
From 5a9f2b875c5195f78d8cd6a5e6ef9ff42d7d572e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:15:22 +0200 Subject: [PATCH 255/871] =?UTF-8?q?=E0=A4=B5=E0=A4=BE=E0=A4=B2=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index a04ed04f..7b3b9c23 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -29,6 +29,13 @@ def process_node(self, node): node.deprel = 'compound' # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. node.upos = "VERB" + # والا (vālā) with infinitive is annotated as auxiliary but it should not. + # It is not even a verb (it does not have a verbal paradigm); it is more + # like an adjective morphologically, and like a noun syntactically. It means + # “the one who does the action of the content verb infinitive.” + elif node.lemma == 'वाला' or node.lemma == 'والا': + node.upos = 'ADJ' + node.deprel = 'compound' elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': secpred = node.parent grandparent = secpred.parent From c1c3518d872c265391c6efdcbf35a7829c8cfe99 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:21:02 +0200 Subject: [PATCH 256/871] =?UTF-8?q?If=20v=C3=A1l=C3=A1=20is=20no=20longer?= =?UTF-8?q?=20adposition,=20it=20cannot=20have=20AdpType.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 7b3b9c23..e843dd6f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -29,12 +29,15 @@ def process_node(self, node): node.deprel = 'compound' # The word is no longer treated as an auxiliary, so it should be 
VERB rather than AUX. node.upos = "VERB" - # والا (vālā) with infinitive is annotated as auxiliary but it should not. + # वाला والا (vālā) with infinitive is annotated as auxiliary but it should not. # It is not even a verb (it does not have a verbal paradigm); it is more # like an adjective morphologically, and like a noun syntactically. It means # “the one who does the action of the content verb infinitive.” + # Some occurrences in the original annotation are case or mark, so we do not + # check AUX/aux here. elif node.lemma == 'वाला' or node.lemma == 'والا': node.upos = 'ADJ' + node.feats['AdpType'] = '' node.deprel = 'compound' elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': secpred = node.parent From 0e5f4909850ac14097ec1a70c625275e3402bd35 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:33:25 +0200 Subject: [PATCH 257/871] =?UTF-8?q?Features=20of=20v=C4=81l=C4=81.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e843dd6f..34566e06 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -38,6 +38,8 @@ def process_node(self, node): elif node.lemma == 'वाला' or node.lemma == 'والا': node.upos = 'ADJ' node.feats['AdpType'] = '' + node.feats['VerbForm'] = '' + node.feats['Aspect'] = '' node.deprel = 'compound' elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': secpred = node.parent From 783246b8c556823f7632fb0114ce4e0edbf2af94 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:42:37 +0200 Subject: [PATCH 258/871] =?UTF-8?q?=D8=B3=DA=A9=DB=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py 
b/udapi/block/ud/hi/fixaux.py index 34566e06..b18c074f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -81,6 +81,8 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' + if node.lemma == 'سکے': + node.lemma = 'سک' # The postposition ke after a verbal stem is not an auxiliary. # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” if node.lemma == 'کا' and node.form == 'کے': From d357827545a125cb45e21d99670c309c30b85cd1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 16:08:49 +0200 Subject: [PATCH 259/871] =?UTF-8?q?=D8=A8=D9=86=D8=A7=20is=20not=20the=20l?= =?UTF-8?q?emma=20of=20=DA=A9=D8=B1.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index b18c074f..75e7f9bb 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -69,6 +69,10 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # بنانا बनाना “make, create, produce, cause to be/become” + # (I don't know why in some instances بنا was used as lemma for کر “to do”.) + if node.form == 'کر' and node.lemma == 'بنا': + node.lemma = 'کر' # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) 
if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': node.lemma = 'چاہئے' From 646b0c47d64c0977b2354a75e10bb4596538e941 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 20:19:00 +0200 Subject: [PATCH 260/871] =?UTF-8?q?=DA=86=DA=A9=D8=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 75e7f9bb..9d59195d 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -79,6 +79,9 @@ def fix_lemma(self, node): if node.form == 'چاہئیں': node.lemma = 'چاہئے' node.feats['Number'] = 'Plur' + # چکا is a perfective participle of چکنا (cuknā) “to be finished” + if node.lemma == 'چکا': + node.lemma = 'چک' # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': node.lemma = 'جا' From fc741a6edaabc0bdac2fd0a65293c7b561843519 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 20:27:20 +0200 Subject: [PATCH 261/871] =?UTF-8?q?=D8=B1=DB=81=D8=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 9d59195d..7b3de989 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -88,6 +88,10 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' + # رہا is a perfective participle of رہنا (rahnā) “to stay” + if node.lemma == 'رہا': + node.lemma = 'رہ' + # sakna to be able to if node.lemma == 'سکے': node.lemma = 'سک' # The postposition ke after a verbal stem is not an auxiliary. 
From 2848129c7eb7ee0d3f081181b48949acbded312f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 22:40:14 +0200 Subject: [PATCH 262/871] =?UTF-8?q?Lemmatization=20of=20=D9=88=D8=A7=D9=84?= =?UTF-8?q?=DB=8C.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 7b3de989..0c811365 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -94,6 +94,10 @@ def fix_lemma(self, node): # sakna to be able to if node.lemma == 'سکے': node.lemma = 'سک' + # The compound part vālā is not an auxiliary. We handle it in process_node() + # but it must be lemmatized properly. + if node.lemma == 'والی': + node.lemma = 'والا' # The postposition ke after a verbal stem is not an auxiliary. # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” if node.lemma == 'کا' and node.form == 'کے': From 51f6f1614b451969aeff200138cdfef8afcad1ce Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 22:52:20 +0200 Subject: [PATCH 263/871] =?UTF-8?q?Lemmatization=20of=20=DA=AF=D8=A7,=20?= =?UTF-8?q?=DA=AF=DB=8C,=20=DA=AF=DB=92.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 0c811365..da5d6b42 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -82,6 +82,12 @@ def fix_lemma(self, node): # چکا is a perfective participle of چکنا (cuknā) “to be finished” if node.lemma == 'چکا': node.lemma = 'چک' + # گا, گی, گے denote the future tense. They are written as separate + # words in Urdu (while they are just suffixes in Hindi). However, + # when written as a separate auxiliary, all these forms should share + # the same lemma. 
+ if node.lemma == 'گی' or node.lemma = 'گے': + node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': node.lemma = 'جا' From c11aa6703f0854a3f83238695656da106a88c74a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 22:53:31 +0200 Subject: [PATCH 264/871] Bug fix. --- udapi/block/ud/hi/fixaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index da5d6b42..1843e2de 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -86,7 +86,7 @@ def fix_lemma(self, node): # words in Urdu (while they are just suffixes in Hindi). However, # when written as a separate auxiliary, all these forms should share # the same lemma. - if node.lemma == 'گی' or node.lemma = 'گے': + if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': From 57dffd5625e94b369ac06410196ae7a3e7f3bd27 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 23:09:57 +0200 Subject: [PATCH 265/871] Lemmatization of lena, dena. --- udapi/block/ud/hi/fixaux.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 1843e2de..00141be4 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -82,6 +82,9 @@ def fix_lemma(self, node): # چکا is a perfective participle of چکنا (cuknā) “to be finished” if node.lemma == 'چکا': node.lemma = 'چک' + # دیا is a perfective participle of دینا (denā) “to give” + if node.lemma == 'دیا': + node.lemma = 'دے' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). 
However, # when written as a separate auxiliary, all these forms should share @@ -91,6 +94,9 @@ def fix_lemma(self, node): # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': node.lemma = 'جا' + # لیا is a perfective participle of لینا (lenā) “to take” + if node.lemma == 'لیا': + node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' From d6a66df056d57b0fe24ded3770aa182b35ca0a3f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 23:22:29 +0200 Subject: [PATCH 266/871] =?UTF-8?q?=D8=A7=D9=B9=DA=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 00141be4..28293ab6 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -16,7 +16,7 @@ def process_node(self, node): # are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'رکھ', 'آ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -69,6 +69,9 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # اٹھنا “to rise, get up” + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' # بنانا बनाना “make, create, produce, cause to be/become” # (I don't know why in some instances بنا was used as lemma for کر “to do”.) 
if node.form == 'کر' and node.lemma == 'بنا': From 160d20df833612ccbb52b7b1a4586467b9051bb1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 09:58:18 +0200 Subject: [PATCH 267/871] =?UTF-8?q?=D8=AF=DB=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 28293ab6..5a4351e1 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -86,7 +86,7 @@ def fix_lemma(self, node): if node.lemma == 'چکا': node.lemma = 'چک' # دیا is a perfective participle of دینا (denā) “to give” - if node.lemma == 'دیا': + if node.lemma == 'دیا' or node.lemma == 'دی': node.lemma = 'دے' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). However, From eab5615969dbe4bd7b5349ad3a58f3ced14f19ee Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:07:29 +0200 Subject: [PATCH 268/871] =?UTF-8?q?Wrongly=20lemmatized=20forms=20of=20?= =?UTF-8?q?=D8=AC=D8=A7=D9=86=D8=A7=E2=80=8E=20(j=C4=81n=C4=81)=20?= =?UTF-8?q?=E2=80=9Cto=20go=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 5a4351e1..b6b68ceb 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -95,7 +95,7 @@ def fix_lemma(self, node): if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” - if node.lemma == 'گیا': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': From 
18dc8b44dd288f8b125467144d56c1447c135f60 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:18:36 +0200 Subject: [PATCH 269/871] =?UTF-8?q?Wrongly=20lemmatized=20forms=20of=20?= =?UTF-8?q?=D8=AC=D8=A7=D9=86=D8=A7=E2=80=8E=20(j=C4=81n=C4=81)=20?= =?UTF-8?q?=E2=80=9Cto=20go=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index b6b68ceb..81f4653b 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -95,7 +95,8 @@ def fix_lemma(self, node): if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی': + # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': From bd1f23a3c60e9c8e0bb1ccef87851fbafb6a4fa3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:26:57 +0200 Subject: [PATCH 270/871] Fix lemmatization. 
--- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 81f4653b..3b7e2f02 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -82,6 +82,9 @@ def fix_lemma(self, node): if node.form == 'چاہئیں': node.lemma = 'چاہئے' node.feats['Number'] = 'Plur' + # چاہے seems to be a wrong lemma of چاہیں_گے “would like” + if node.lemma == 'چاہے': + node.lemma = 'چاہ' # چکا is a perfective participle of چکنا (cuknā) “to be finished” if node.lemma == 'چکا': node.lemma = 'چک' @@ -108,7 +111,7 @@ def fix_lemma(self, node): if node.lemma == 'رہا': node.lemma = 'رہ' # sakna to be able to - if node.lemma == 'سکے': + if node.lemma == 'سکے' or node.lemma == 'سکی': node.lemma = 'سک' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. From 8b23ecc0f6c2e58777bd70e2b4c11c4c76fba58d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:32:59 +0200 Subject: [PATCH 271/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 3b7e2f02..ecf7c3b2 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -99,7 +99,7 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. 
- if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': @@ -108,10 +108,10 @@ def fix_lemma(self, node): if node.lemma == 'لگا': node.lemma = 'لگ' # رہا is a perfective participle of رہنا (rahnā) “to stay” - if node.lemma == 'رہا': + if node.lemma == 'رہا' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to - if node.lemma == 'سکے' or node.lemma == 'سکی': + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': node.lemma = 'سک' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. From 356a7cda698ec00b2c0fc594c3b3157a05c50300 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:39:13 +0200 Subject: [PATCH 272/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index ecf7c3b2..f320bb98 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -12,11 +12,11 @@ def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), - # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There - # are also jā (to go) and paṛa (to fall) but we do not list them here + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring). + # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. 
hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -99,7 +99,7 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': From f8e8a0aac7e6dfb52f2ff256d1f4c4fa95a6f271 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:48:40 +0200 Subject: [PATCH 273/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f320bb98..0a7f8d3a 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -12,11 +12,12 @@ def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), - # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring). + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), + # pahuñc (to reach). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. 
- hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -102,7 +103,7 @@ def fix_lemma(self, node): if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا': + if node.lemma == 'لیا' or node.lemma == 'لو': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From 04068d58a6b5488691079f5538ac122c084925ec Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:54:52 +0200 Subject: [PATCH 274/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 0a7f8d3a..2ce70d6a 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -102,8 +102,11 @@ def fix_lemma(self, node): # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': node.lemma = 'جا' + # Wrongly lemmatized present forms of “to be”. 
+ if node.lemma == 'ہوں' or node.lemma == 'ہوا': + node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا' or node.lemma == 'لو': + if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From d0275bec69254fc9b3b910584d10d13d4052e32c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:01:58 +0200 Subject: [PATCH 275/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 2ce70d6a..2178e421 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -111,8 +111,11 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' + # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” + if node.lemma == 'پڑے': + node.lemma = 'پڑ' # رہا is a perfective participle of رہنا (rahnā) “to stay” - if node.lemma == 'رہا' or node.lemma == 'رہے': + if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': From 8d6866b473195bf97d79a9b25fd9b21984541e62 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:08:26 +0200 Subject: [PATCH 276/871] Fix lemmatization. 
--- udapi/block/ud/hi/fixaux.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 2178e421..dcf6fe3a 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,11 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach). + # pahuñc (to reach), dekh (to look). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -100,13 +100,13 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. 
if node.lemma == 'ہوں' or node.lemma == 'ہوا': node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی': + if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From 86248be9f7d975d6d96a2daab5208b2175a1762c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:15:02 +0200 Subject: [PATCH 277/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index dcf6fe3a..84a4b9c1 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -120,6 +120,9 @@ def fix_lemma(self, node): # sakna to be able to if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': node.lemma = 'سک' + # Wrongly lemmatized past forms of “to be”. + if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': + node.lemma = 'تھا' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. if node.lemma == 'والی': From 6a726c962590fd03e9bf56096853e80e0560d319 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:21:29 +0200 Subject: [PATCH 278/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 84a4b9c1..ea865b14 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -70,6 +70,9 @@ def fix_lemma(self, node): must be fixed. 
""" if node.upos == 'AUX': + # آنے is the oblique infinitive form of “to come” + if node.lemma == 'آنہ': + node.lemma = 'آ' # اٹھنا “to rise, get up” if node.lemma == 'اٹھا': node.lemma = 'اٹھ' @@ -90,7 +93,7 @@ def fix_lemma(self, node): if node.lemma == 'چکا': node.lemma = 'چک' # دیا is a perfective participle of دینا (denā) “to give” - if node.lemma == 'دیا' or node.lemma == 'دی': + if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': node.lemma = 'دے' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). However, From 0cd5546c5397b4055797d3f6e9e09f3f67f0c7ef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:29:57 +0200 Subject: [PATCH 279/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index ea865b14..df914996 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,11 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look). + # pahuñc (to reach), dekh (to look), phar (to return). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. 
@@ -117,6 +117,9 @@ def fix_lemma(self, node): # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” if node.lemma == 'پڑے': node.lemma = 'پڑ' + # پھرے is a perfective participle of پھرنا (pharnā) “to return” + if node.lemma == 'پھرے': + node.lemma = 'پھر' # رہا is a perfective participle of رہنا (rahnā) “to stay” if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' From 9046d0e4ed67f9dadd6088a55d14d390810e55d6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:40:44 +0200 Subject: [PATCH 280/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index df914996..1507e8e4 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,11 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look), phar (to return). + # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -103,7 +103,7 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. 
It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. if node.lemma == 'ہوں' or node.lemma == 'ہوا': From 27321a9cae361be1e251f3ffe165055c2a591ac3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:48:48 +0200 Subject: [PATCH 281/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 1507e8e4..e6435d09 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -17,7 +17,7 @@ def process_node(self, node): # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. 
@@ -124,7 +124,7 @@ def fix_lemma(self, node): if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to - if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': node.lemma = 'سک' # Wrongly lemmatized past forms of “to be”. if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': From 3c0e186abb780fdcf6ae18064c7ea18170cdc37b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:56:40 +0200 Subject: [PATCH 282/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e6435d09..440bcd80 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -106,10 +106,12 @@ def fix_lemma(self, node): if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. - if node.lemma == 'ہوں' or node.lemma == 'ہوا': + # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': + # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. 
+ if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From 5c9c72eed8952fbd51b44963816e5215f2c51922 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 12:16:16 +0200 Subject: [PATCH 283/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 440bcd80..f6d9e7d7 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -95,6 +95,9 @@ def fix_lemma(self, node): # دیا is a perfective participle of دینا (denā) “to give” if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': node.lemma = 'دے' + # دکھائیں (dikhānā) “to show” + if node.form == 'دکھائیں': + node.lemma = 'دکھا' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). However, # when written as a separate auxiliary, all these forms should share From f678f17af97ee046759b677d6c3b5cda1ab06451 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 12:27:02 +0200 Subject: [PATCH 284/871] Spurious semantic auxiliaries. --- udapi/block/ud/hi/fixaux.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f6d9e7d7..1f2d670c 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,12 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk). 
+ # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), + # caṛh (to climb), saṛ (to rot). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. From bff1cecf97b947dc44a60a385209b9fca74a9bc2 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 12:53:16 +0200 Subject: [PATCH 285/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 1f2d670c..e6ec6c53 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -17,7 +17,7 @@ def process_node(self, node): # caṛh (to climb), saṛ (to rot). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. 
@@ -90,7 +90,9 @@ def fix_lemma(self, node): # چاہے seems to be a wrong lemma of چاہیں_گے “would like” if node.lemma == 'چاہے': node.lemma = 'چاہ' - # چکا is a perfective participle of چکنا (cuknā) “to be finished” + # चुका چکا is a perfective participle of चुकना چکنا (cuknā) “to be finished” + if node.lemma == 'चुका': + node.lemma = 'चुक' if node.lemma == 'چکا': node.lemma = 'چک' # دیا is a perfective participle of دینا (denā) “to give” @@ -111,6 +113,8 @@ def fix_lemma(self, node): node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'हों': + node.lemma = 'है' if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” From d956b903dbe9bb3d86079e579706bd9b91cf9a6d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 13:06:39 +0200 Subject: [PATCH 286/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e6ec6c53..be0eaf02 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -14,10 +14,10 @@ def process_node(self, node): # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot). + # caṛh (to climb), saṛ (to rot), nikāl (to remove). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. 
- hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -95,7 +95,9 @@ def fix_lemma(self, node): node.lemma = 'चुक' if node.lemma == 'چکا': node.lemma = 'چک' - # دیا is a perfective participle of دینا (denā) “to give” + # दिया دیا is a perfective participle of देना دینا (denā) “to give” + if node.lemma == 'दिया': + node.lemma = 'दे' if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': node.lemma = 'دے' # دکھائیں (dikhānā) “to show” @@ -117,8 +119,10 @@ def fix_lemma(self, node): node.lemma = 'है' if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' - # لیا is a perfective participle of لینا (lenā) “to take” + # लिया لیا is a perfective participle of लेना لینا (lenā) “to take” # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'लिया': + node.lemma = 'ले' if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” From 5308b9e5063d21b17ed2548111f54c2e81e8b1c6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 13:16:16 +0200 Subject: [PATCH 287/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index be0eaf02..16dfc0aa 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -109,8 +109,10 @@ def fix_lemma(self, node): # the same lemma. 
if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' - # گیا is a perfective participle of جانا‎ (jānā) “to go” - # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + # گیا is a perfective participle of जाना جانا‎ (jānā) “to go” + # जान جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + if node.lemma == 'जाना' or node.lemma == 'जान': + node.lemma = 'जा' if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. From 7484b1e55e5820abbb079cf19f8bc7013ae1f726 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 13:53:52 +0200 Subject: [PATCH 288/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 16dfc0aa..595f1725 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -14,10 +14,10 @@ def process_node(self, node): # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot), nikāl (to remove). + # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. 
- hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -136,13 +136,17 @@ def fix_lemma(self, node): # پھرے is a perfective participle of پھرنا (pharnā) “to return” if node.lemma == 'پھرے': node.lemma = 'پھر' - # رہا is a perfective participle of رہنا (rahnā) “to stay” + # रहा رہا is a perfective participle of रहना رہنا (rahnā) “to stay” + if node.lemma == 'रहा' or node.lemma == 'रहूं': + node.lemma = 'रह' if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': node.lemma = 'سک' # Wrongly lemmatized past forms of “to be”. + if node.lemma == 'थी': + node.lemma = 'था' if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': node.lemma = 'تھا' # The compound part vālā is not an auxiliary. We handle it in process_node() From 8f5e017e9ab0329fa9894ccde17d2e07d56a207b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 15:29:42 +0200 Subject: [PATCH 289/871] Fix lemmatization. 
--- udapi/block/ud/hi/fixaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 595f1725..041f5d44 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -127,7 +127,9 @@ def fix_lemma(self, node): node.lemma = 'ले' if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' - # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” + # लगा لگا is a perfective participle of लगना لگنا (lagnā) “to seem, to appear” + if node.lemma == 'लगा': + node.lemma = 'लग' if node.lemma == 'لگا': node.lemma = 'لگ' # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” From 3341b087c67755c3fbe2b4724adcf1776421d453 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 15:47:29 +0200 Subject: [PATCH 290/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 041f5d44..feb622d9 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -74,9 +74,6 @@ def fix_lemma(self, node): # آنے is the oblique infinitive form of “to come” if node.lemma == 'آنہ': node.lemma = 'آ' - # اٹھنا “to rise, get up” - if node.lemma == 'اٹھا': - node.lemma = 'اٹھ' # بنانا बनाना “make, create, produce, cause to be/become” # (I don't know why in some instances بنا was used as lemma for کر “to do”.) if node.form == 'کر' and node.lemma == 'بنا': @@ -151,6 +148,11 @@ def fix_lemma(self, node): node.lemma = 'था' if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': node.lemma = 'تھا' + # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up” + if node.lemma == 'उठा': + node.lemma = 'उठ' + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' # The compound part vālā is not an auxiliary. 
We handle it in process_node() # but it must be lemmatized properly. if node.lemma == 'والی': From 326e3d200f3efce29098efc47f73e73ecf34cc21 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:02:34 +0200 Subject: [PATCH 291/871] Spurious semantic auxiliaries. --- udapi/block/ud/hi/fixaux.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index feb622d9..fd0c77c0 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -14,18 +14,20 @@ def process_node(self, node): # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop). + # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop), samā + # (to encounter), dhamaka (to bully). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा'] + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा', 'समा', 'धमक'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. - # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # चाहना چاہنا (cāhnā) “to want, to wish” is a control verb but not an auxiliary. # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. 
- # دکھانا दिखाना (dikhānā) “to show” - hiphase = ['लग', 'चुक', 'चाह', 'दिखा'] - urphase = ['لگ', 'چک', 'چاہ', 'دکھا'] + # दिखाना دکھانا (dikhānā) “to show” + # बनना بننا (bananā) “to become” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': node.deprel = 'compound' From 67dc40eb1a41ca4fd0be6554c2577767cf532c77 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:21:46 +0200 Subject: [PATCH 292/871] Spurious semantic auxiliaries. --- udapi/block/ud/hi/fixaux.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index fd0c77c0..ec61592f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -11,14 +11,15 @@ class FixAux(Block): def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically - # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), + # less salient element: le (to take), de (to give), ḍāla / phenka (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop), samā - # (to encounter), dhamaka (to bully). + # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), + # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), + # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), + # gujara (to pass). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. 
- hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा', 'समा', 'धमक'] + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -116,7 +117,7 @@ def fix_lemma(self, node): node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. - if node.lemma == 'हों': + if node.lemma == 'हों' or node.lemma == 'है.': node.lemma = 'है' if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' From 7ebadbeecf5a10f5a518987822f534f7ae4aa54c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:35:58 +0200 Subject: [PATCH 293/871] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index ec61592f..49518e05 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -132,6 +132,9 @@ def fix_lemma(self, node): node.lemma = 'लग' if node.lemma == 'لگا': node.lemma = 'لگ' + # पहुंचा پہنچا is a perfective participle of पहुंचना پہنچنا (pahuñcnā) “to reach” + if node.lemma == 'पहुंचा' or node.lemma == 'पहुँच': + node.lemma = 'पहुंच' # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” if node.lemma == 'پڑے': node.lemma = 'پڑ' From 58bdbc12283ea253f33a12a3966b2166b82e463f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:49:07 +0200 Subject: [PATCH 294/871] Spurious semantic auxiliaries. 
--- udapi/block/ud/hi/fixaux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 49518e05..004ab4af 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -16,10 +16,10 @@ def process_node(self, node): # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), - # gujara (to pass). + # gujara (to pass), ghera (to surround), baca (to escape). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर'] + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर', 'फूंक', 'घेर', 'बच'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -27,7 +27,7 @@ def process_node(self, node): # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. 
# दिखाना دکھانا (dikhānā) “to show” # बनना بننا (bananā) “to become” - hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन'] + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा'] urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': From 5a239dd41c325b1583fe635b9854a2e68d2893a9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 30 Sep 2022 13:21:36 +0200 Subject: [PATCH 295/871] FixLeaf: case and mark should be among the defaults. --- udapi/block/ud/fixleaf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py index d715ec01..9b4ce191 100644 --- a/udapi/block/ud/fixleaf.py +++ b/udapi/block/ud/fixleaf.py @@ -8,11 +8,11 @@ class FixLeaf(Block): """ - Make sure that aux and cop dependents are leaves unless one of the known - exceptions applies. + Make sure that function words are leaves unless one of the known exceptions + applies. """ - def __init__(self, deprels='aux,cop,cc', **kwargs): + def __init__(self, deprels='aux,cop,case,mark,cc', **kwargs): """ Args: deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc. From b2d1ea858224ffb8ef37929e4e998b524dc9d98a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 30 Sep 2022 13:53:01 +0200 Subject: [PATCH 296/871] Paired punctuation must not cause non-projectivity if an outside node depends on an inside node. --- udapi/block/ud/fixpunct.py | 58 +++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 95cb40d0..15d310c7 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -27,15 +27,15 @@ '{': '}', '"': '"', # ASCII double quotes "'": "'", # ASCII single quotes - '“': '”', # quotation marks used in English,... - '„': '“', # Czech, German, Russian,... 
- '«': '»', # French, Russian, Spanish,... + '“': '”', # quotation marks used in English, ... + '„': '“', # Czech, German, Russian, ... + '«': '»', # French, Russian, Spanish, ... '‹': '›', # dtto '《': '》', # Korean, Chinese '「': '」', # Chinese, Japanese - '『': '』', # dtto - '¿': '?', # Spanish question quotation marks - '¡': '!', # Spanish exclamation quotation marks + '『': '』', # ditto + '¿': '?', # Spanish paired question marks + '¡': '!', # Spanish paired exclamation marks } FINAL_PUNCT = '.?!' @@ -65,7 +65,7 @@ def process_tree(self, root): # This may introduce multiple subroots, which will be fixed later on # (preventing to temporarily create multiple subroots here would prevent fixing some errors). for node in root.descendants: - while node.parent.upos == "PUNCT": + while node.parent.upos == 'PUNCT': node.parent = node.parent.parent # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type. @@ -83,10 +83,9 @@ def process_tree(self, root): # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type). for node in root.descendants: - if node.upos == "PUNCT" and not self._punct_type[node.ord]: + if node.upos == 'PUNCT' and not self._punct_type[node.ord]: self._fix_subord_punct(node) - # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot". # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator. # So lets prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children). @@ -107,7 +106,7 @@ def process_tree(self, root): # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well. 
if self.copy_to_enhanced: for node in root.descendants: - if node.upos == "PUNCT": + if node.upos == 'PUNCT': node.deps = [{'parent': node.parent, 'deprel': 'punct'}] def _fix_subord_punct(self, node): @@ -131,12 +130,12 @@ def _fix_subord_punct(self, node): l_cand, r_cand = node.prev_node, node.next_node if node.form in FINAL_PUNCT: r_cand = None - while l_cand.ord > 0 and l_cand.upos == "PUNCT": + while l_cand.ord > 0 and l_cand.upos == 'PUNCT': if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node: l_cand = None break l_cand = l_cand.prev_node - while r_cand is not None and r_cand.upos == "PUNCT": + while r_cand is not None and r_cand.upos == 'PUNCT': if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node: r_cand = None break @@ -193,7 +192,7 @@ def _fix_subord_punct(self, node): # We try to be conservative and keep the parent, unless we are sure it is wrong. if node.parent not in path: node.parent = cand - node.deprel = "punct" + node.deprel = 'punct' def _will_be_projective(self, node, cand): node.parent = cand @@ -206,7 +205,6 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): if (self.check_paired_punct_upos or opening_node.form == "'") and opening_node.upos != 'PUNCT': return - nested_level = 0 for node in root.descendants[opening_node.ord:]: if node.form == closing_punct: @@ -219,14 +217,31 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): nested_level += 1 def _fix_pair(self, root, opening_node, closing_node): + # Ideally, paired punctuation symbols should be attached to the single + # head of the subtree inside. Provided the inside segment is a single + # subtree. 
heads = [] punct_heads = [] - for node in root.descendants[opening_node.ord: closing_node.ord - 1]: - if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): - if node.upos == 'PUNCT': - punct_heads.append(node) - else: - heads.append(node) + for node in root.descendants: + if node == opening_node or node == closing_node: + continue + # If this is a node inside of the pair, is its parent outside? + if opening_node.precedes(node) and node.precedes(closing_node): + if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): + if node.upos == 'PUNCT': + punct_heads.append(node) + else: + heads.append(node) + # Not only the punctuation symbols must not be attached non-projectively, + # they also must not cause non-projectivity of other relations. This could + # happen if an outside node is attached to an inside node. To account for + # this, mark the inside parent as a head, too. + else: + if opening_node.precedes(node.parent) and node.parent.precedes(closing_node): + if node.parent.upos == 'PUNCT': + punct_heads.append(node.parent) + else: + heads.append(node.parent) # Punctuation should not have children, but if there is no other head candidate, # let's break this rule. @@ -246,6 +261,9 @@ def _fix_pair(self, root, opening_node, closing_node): # However, this means that the paired punctuation will be attached non-projectively, # which is forbidden by the UD guidelines. # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities. + # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of + # inside heads and inside parents of outside nodes). + heads.sort(key=lambda x: x.ord) opening_node.parent = heads[0] closing_node.parent = heads[-1] From 559ec080cdf924289c95468b94640053c069621b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 11 Nov 2022 10:27:35 +0100 Subject: [PATCH 297/871] Optionally fix non-copula auxiliaries alongside copulas. 
--- udapi/block/ud/fixpseudocop.py | 53 +++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py index ab07eaaa..ecc5f0bd 100644 --- a/udapi/block/ud/fixpseudocop.py +++ b/udapi/block/ud/fixpseudocop.py @@ -7,33 +7,40 @@ class FixPseudoCop(Block): - def __init__(self, lemma, **kwargs): + def __init__(self, lemmas, noncopaux=False, **kwargs): """Create the ud.FixPseudoCop block instance. Args: - lemma: the lemma of the pseudocopula that should be fixed + lemmas: comma-separated list of lemmas of the pseudocopulas that should be fixed + noncopaux: do the same for non-copula auxiliaries with the given lemma """ super().__init__(**kwargs) - self.lemma = lemma + self.lemmas = lemmas.split(',') + self.noncopaux = noncopaux def process_node(self, node): - pseudocop = self.lemma - if node.lemma == pseudocop and node.udeprel == "cop": - secpred = node.parent - grandparent = secpred.parent - node.parent = grandparent - node.deprel = secpred.deprel - secpred.parent = node - secpred.deprel = "xcomp" - ###!!! We should also take care of DEPS if they exist. - # As a copula, the word was tagged AUX. Now it should be VERB. - node.upos = "VERB" - # Examine the children of the original parent. - # Those that modify the clause should be re-attached to me. - # Those that modify the word (noun, adjective) should stay there. - for c in secpred.children: - # obl is borderline. It could modify an adjective rather than a clause. - # obj and iobj should not occur in copular clauses but it sometimes - # occurs with pseudocopulas: "I declare him handsome." - if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): - c.parent = node + pseudocop = self.lemmas + if node.lemma in pseudocop: + # Besides spurious copulas, this block can be optionally used to fix spurious auxiliaries (if noncopaux is set). 
+ if node.udeprel == 'cop' or self.noncopaux and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # As a copula, the word was tagged AUX. Now it should be VERB. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + # Another possible error is that the word is tagged AUX without being attached as "cop" or "aux". + elif self.noncopaux and node.upos == 'AUX': + node.upos = 'VERB' From ea1c77b57cfbd3cea9e7afa26a789bc688bf7003 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Nov 2022 11:17:36 +0100 Subject: [PATCH 298/871] Delete --use-feature=in-tree-build This feature was removed in pip 22.3 (in-tree builds are now the default) and it results in CircleCI fails. --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7be539d2..4e88d664 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -35,7 +35,7 @@ jobs: # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. 
- run: name: Install Udapi - command: pip install --use-feature=in-tree-build ".[test]" + command: pip install ".[test]" - run: name: Run pytest tests # This assumes pytest is installed via the install-package step above From 1ef23f857b8f8c9b7080b9c33d2dde56a14abf1f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 16:58:39 +0100 Subject: [PATCH 299/871] Reworked feature checking so that a similar block can be written for another language. --- udapi/block/ud/cs/markfeatsbugs.py | 37 ++------------- udapi/block/ud/markfeatsbugs.py | 75 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 32 deletions(-) create mode 100644 udapi/block/ud/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 11ecd6d9..3fb8d058 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -2,12 +2,15 @@ Block to identify missing or ill-valued features in Czech. Any bugs that it finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 attributes=form,lemma,upos,xpos,feats,deprel,misc """ -from udapi.core.block import Block +import udapi.block.ud.markfeatsbugs import logging import re -class MarkFeatsBugs(Block): +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): # The convention used in PDT is not consistent. Adjectives are fully disambiguated # (three genders, two animacies, three numbers, seven cases), even though some @@ -21,36 +24,6 @@ class MarkFeatsBugs(Block): # in the future. 
pdt20 = False # True = like in PDT 2.0; False = like in ČNK - def bug(self, node, bugstring): - bugs = [] - if node.misc['Bug']: - bugs = node.misc['Bug'].split('+') - if not bugstring in bugs: - bugs.append(bugstring) - node.misc['Bug'] = '+'.join(bugs) - - def check_allowed_features(self, node, allowed): - """ - We need a dictionary indexed by feature names that are allowed; for each - feature name, there is a list of allowed values. - """ - # Check for features that are not allowed but the node has them. - # For features that are allowed, check that their values are allowed. - for f in node.feats: - if f in allowed: - if not node.feats[f] in allowed[f]: - self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') - else: - self.bug(node, 'Feat' + f + 'NotAllowed') - - def check_required_features(self, node, required): - """ - We need a list of names of features whose values must not be empty. - """ - for f in required: - if not f in node.feats: - self.bug(node, 'Feat' + f + 'Missing') - def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py new file mode 100644 index 00000000..b24dcecb --- /dev/null +++ b/udapi/block/ud/markfeatsbugs.py @@ -0,0 +1,75 @@ +""" +Block to identify missing or ill-valued features in a treebank. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. This is a base block that only +implements service methods. A language-specific block must be derived from this +one and define the actual rules valid in that language. 
+ +Usage (Czech example): cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +""" +from udapi.core.block import Block +import logging +import re + +class MarkFeatsBugs(Block): + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. + for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. + """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + """ + This is a generic block, do nothing here. 
In a language-specific block + based on this one, rules similar to the examples below can be specified: + + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + #... + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + """ + return From 5a836db5852a97a69b972646493024894a7d3ca4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:37:16 +0100 Subject: [PATCH 300/871] Added Latin. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 608 +++++++++++++++++++++++++++++ 2 files changed, 609 insertions(+), 1 deletion(-) create mode 100644 udapi/block/ud/la/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 3fb8d058..ef203033 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. 
Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..8741eabb --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,608 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAM ud.la.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 
'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + elif 
node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 
'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: # regular adjectives + if 
node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jemu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... je + # No gender in dual and plural: + # Plur ... 
jich, jim, je, nich, jimi + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. Někdo and nikdo behave like singular; + # kdo is by default singular as well but it also occurs as a subject + # of plural verbs. + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Gender': ['Masc'], + 'Animacy': ['Anim'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + # Although these pronouns behave by default as neuter singular, + # no Gender and Number is annotated. However, quite unusually, + # there is Animacy=Inan without Gender. + ###!!! This should probably be fixed in all Czech treebanks and + ###!!! in Interset. The pronoun should get Gender=Neut and no + ###!!! animacy. For now, let's at least make animacy an optional + ###!!! feature (I see that we already do not fill it in the Old + ###!!! Czech data). + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Animacy': ['Inan'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif node.lemma == 'ješto': + # Unlike 'jenžto', this relative pronoun does not inflect, it + # always occurs in a nominative position, but the context can + # be any gender and number. 
+ self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Case': ['Nom'] + }) + elif re.match(r'^(jenž|jenžto)$', node.lemma): + # The relative pronouns 'jenž', 'jenžto' inflect for gender; + # while we normally take this as a sign of DET (instead of PRON), + # these can never act as real DET because they never modify a + # nominal. + # Similarly to the personal pronoun 'on', animacy is only + # annotated for masculine nominative plural, non-nominative + # forms are merged for masculine and neuter (jehož, jemuž), and + # non-singular gender is only annotated in nominative (while + # these cases are common for all genders: jichž, jimž, jimiž). + # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even + # in the nominative, although there is no prepositional counter- + # part (but similarly the locative has no prepositionless form). + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. 
+ if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. + if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc,Neut'] + }) + elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + # Congruent gender is annotated only in singular. Masculine and + # neuter are merged even in nominative. Feminine singular does + # not distinguish case in PDT but we need it in Old Czech at + # least for 'jejiej'. 
+ if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + if node.feats['Reflex'] == 'Yes': + self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) + else: + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + ###!!! Somehow the NumValue feature from PDT via Interset is useless. 
+ # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. + if node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES 
################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + self.check_required_features(node, ['Aspect', 'VerbForm']) + if node.feats['VerbForm'] == 'Inf': + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). + self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + if node.feats['Mood'] == 'Cnd': + self.check_required_features(node, ['Mood', 'Person']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 
'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. 
+ self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_featurs = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. 
+ if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) From 0f167c2a64adcb98a61740c348e3b7579502005d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:47:54 +0100 Subject: [PATCH 301/871] Removed Czech-specific rules from Latin block. For a start, the Latin rules check NOUNs and PROPNs only. --- udapi/block/ud/la/markfeatsbugs.py | 598 +---------------------------- 1 file changed, 11 insertions(+), 587 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 8741eabb..4cf6c1b3 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -16,593 +16,17 @@ def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': self.check_required_features(node, ['Gender', 'Number', 'Case']) - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) - else: - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 
'Loc', 'Ins'], - 'Foreign': ['Yes']}) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Foreign': ['Yes']}) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) - else: - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) - # ADJECTIVES ########################################################### - elif node.upos == 'ADJ': - if node.feats['Poss'] == 'Yes': # possessive adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) - self.check_allowed_features(node, { - 'Poss': ['Yes'], - 'Gender[psor]': ['Masc', 'Fem'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'Poss': ['Yes'], - 'Gender[psor]': ['Masc', 'Fem'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 
'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) - elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Ord'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Ord'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) - elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives - self.check_required_features(node, ['VerbForm', 'Voice']) - if node.feats['Voice'] == 'Act': # active participles have tense, passives don't - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Act'], - 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 
'Voice': ['Act'], - 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': 
['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: # regular adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) - # PRONOUNS ############################################################# - elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) - if node.feats['PronType'] == 'Prs': - if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'Variant': ['Short'] - }) - else: # not reflexive - if node.feats['Person'] == '3': # on, ona, ono, oni, ony - if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony - self.check_adjective_like(node, ['PronType', 'Person'], { - 'PronType': ['Prs'], - 'Person': ['3'] - }) - else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně - # Mostly only two gender groups and no animacy: - # Masc,Neut ... jeho, jemu, jej, něm, jím - # Fem ... jí, ji, ní - # Neut ... je - # No gender in dual and plural: - # Plur ... 
jich, jim, je, nich, jimi - self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { - 'PronType': ['Prs'], - 'Person': ['3'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # 1st and 2nd person do not have gender: já, ty - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Variant': ['Short'] - }) - elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo - # There is no Number. Někdo and nikdo behave like singular; - # kdo is by default singular as well but it also occurs as a subject - # of plural verbs. - self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], - 'Gender': ['Masc'], - 'Animacy': ['Anim'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] - }) - elif re.match(r'^(co|což|něco|nicož)$', node.lemma): - # Although these pronouns behave by default as neuter singular, - # no Gender and Number is annotated. However, quite unusually, - # there is Animacy=Inan without Gender. - ###!!! This should probably be fixed in all Czech treebanks and - ###!!! in Interset. The pronoun should get Gender=Neut and no - ###!!! animacy. For now, let's at least make animacy an optional - ###!!! feature (I see that we already do not fill it in the Old - ###!!! Czech data). - self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], - 'Animacy': ['Inan'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] - }) - elif node.lemma == 'ješto': - # Unlike 'jenžto', this relative pronoun does not inflect, it - # always occurs in a nominative position, but the context can - # be any gender and number. 
- self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Case': ['Nom'] - }) - elif re.match(r'^(jenž|jenžto)$', node.lemma): - # The relative pronouns 'jenž', 'jenžto' inflect for gender; - # while we normally take this as a sign of DET (instead of PRON), - # these can never act as real DET because they never modify a - # nominal. - # Similarly to the personal pronoun 'on', animacy is only - # annotated for masculine nominative plural, non-nominative - # forms are merged for masculine and neuter (jehož, jemuž), and - # non-singular gender is only annotated in nominative (while - # these cases are common for all genders: jichž, jimž, jimiž). - # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even - # in the nominative, although there is no prepositional counter- - # part (but similarly the locative has no prepositionless form). - self.check_adjective_like(node, ['PronType', 'PrepCase'], { - 'PronType': ['Rel'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: - # What remains is the relative pronoun 'an'. It behaves similarly - # to 'jenž' but it does not have the PrepCase feature and it - # only occurs in the nominative. 
- if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani - self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Plur'], - 'Case': ['Nom'] - }) - else: # not Masc Plur: an, ana, ano, any - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom'] - }) - # DETERMINERS ########################################################## - elif node.upos == 'DET': - # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. - # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. - if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut'] - }) - elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): - # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - # Congruent gender is annotated only in singular. Masculine and - # neuter are merged even in nominative. Feminine singular does - # not distinguish case in PDT but we need it in Old Czech at - # least for 'jejiej'. 
- if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing'], - 'Gender[psor]': ['Fem'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - else: - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing'], - 'Gender[psor]': ['Fem'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - if node.feats['Reflex'] == 'Yes': - self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'] - }) - else: - self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2'], - 'Number[psor]': ['Sing', 'Plur'] - }) - else: - self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) - # NUMERALS ############################################################# - elif node.upos == 'NUM': - self.check_required_features(node, ['NumType', 'NumForm']) - # Arabic digits and Roman numerals do not have inflection features. - if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] - }) - else: - ###!!! Somehow the NumValue feature from PDT via Interset is useless. 
- # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. - # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. - # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. - # 'pět' and more have Number=Plur, Case: pět, pěti. - if node.lemma == 'jeden': - self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif re.match(r'^(dva|oba)$', node.lemma): - self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) - if self.pdt20: - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - else: - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - # VERBS AND AUXILIARIES 
################################################ - elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': - # There is no voice. For some reason, PDT does not annotate that - # the infinitive form is active (while a passive infinitive is - # a combination of the infinitive with a passive participle). - self.check_required_features(node, ['Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Inf'], - 'Polarity': ['Pos', 'Neg'] - }) - elif node.feats['VerbForm'] == 'Fin': - # Voice is optional. For some reason it is not annotated with - # imperatives (although passive imperatives are a combination - # of the active imperative and a passive participle). It is - # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. - if node.feats['Mood'] == 'Cnd': - self.check_required_features(node, ['Mood', 'Person']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Cnd'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person - }) - elif node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) - 'Number': ['Sing', 'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # indicative - self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Voice': ['Act'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 
'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist - }) - elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Polarity': ['Pos', 'Neg'] - }) - else: - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Fem', 'Neut'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # converb - self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Conv'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy - 'Polarity': ['Pos', 'Neg'] - }) - # ADVERBS ############################################################## - elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] - }) - elif node.feats['Degree'] != '': - # Adverbs that are compared can also be negated. 
- self.check_required_features(node, ['Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] - }) - else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. - self.check_allowed_features(node, {}) - # ADPOSITIONS ########################################################## - elif node.upos == 'ADP': - self.check_required_features(node, ['AdpType', 'Case']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'AdpType': ['Prep', 'Voc'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] - }) - # THE REST: NO FEATURES ################################################ - else: - self.check_allowed_features(node, {}) - - def check_adjective_like(self, node, r0, a0): - """ - Long form of adjectives, pronouns and determiners mostly share declension - paradigms and thus the sets of features that are expected. Whether the - actual feature sets are the same depends on the tagging convention (PDT - vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are - not; in ČNK, both adjectives and pronouns (incl. determiners) are fully - disambiguated. This method defines the core inflectional features while - any extras (such as PronType for pronouns) have to be provided by the - caller in parameters r0 (list) and a0 (dict). - """ - required_features = [] - allowed_featurs = {} - full_set = node.upos == 'ADJ' or not self.pdt20 - if full_set: - # Even in the full set, animacy is only distinguished for the - # masculine gender. 
- if node.feats['Gender'] == 'Masc': - required_features = ['Gender', 'Animacy', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - } - else: - required_features = ['Gender', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - } - else: - # Gender is annotated in all cases in singular (ten, ta, to) - # but only in nominative, accusative, and vocative in plural - # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished - # in plural if gender is distinguished and it is masculine; in - # singular it is distinguished only in accusative (toho, ten). - # Other cases in plural are gender-less (těch, těm, těmi). - # Note that this is not consistent with adjectives, where we - # disambiguate gender in all cases in plural. - if node.feats['Number'] == 'Sing': - if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': - required_features = ['Gender', 'Animacy', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing'], - 'Case': ['Acc'] - } - else: - required_features = ['Gender', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - } - elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): - required_features = ['Gender', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Acc', 'Voc'] - } - else: - required_features = ['Number', 'Case'] - allowed_features = { - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] - } - required_features = r0 + required_features - a0.update(allowed_features) - allowed_features = a0 - self.check_required_features(node, required_features) - self.check_allowed_features(node, allowed_features) + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) From 606515a088cc9779b3fef46795a0c4a6bb1f6613 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:51:05 +0100 Subject: [PATCH 302/871] Usage layout=compact. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 2 +- udapi/block/ud/markfeatsbugs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index ef203033..309e7ac8 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -3,7 +3,7 @@ finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. 
-Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 4cf6c1b3..8aea567f 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -3,7 +3,7 @@ finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. -Usage: cat *.conllu | udapy -HAM ud.la.MarkFeatsBugs > bugs.html +Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py index b24dcecb..1bb8188b 100644 --- a/udapi/block/ud/markfeatsbugs.py +++ b/udapi/block/ud/markfeatsbugs.py @@ -5,7 +5,7 @@ implements service methods. A language-specific block must be derived from this one and define the actual rules valid in that language. -Usage (Czech example): cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html """ from udapi.core.block import Block import logging From 204da3bbb4bfa59c085c4c05a6bc8be2e134e27d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 21:05:39 +0100 Subject: [PATCH 303/871] More rules for Latin features (cloned from Czech). 
--- udapi/block/ud/la/markfeatsbugs.py | 148 +++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 8aea567f..96c7b682 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -30,3 +30,151 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'NameType': ['Giv', 'Sur', 'Geo'], 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + self.check_required_features(node, ['PronType', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] 
+ }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2', '3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + else: + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. 
+ if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + self.check_required_features(node, ['Aspect', 'VerbForm']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # verbal noun + 
self.check_required_features(node, ['Tense', 'Number', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Sing', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From 98db11584577be72a5748c8c81cb4030348270c0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 22:32:50 +0100 Subject: [PATCH 304/871] Added feature rules for Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 191 +++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 udapi/block/ud/ml/markfeatsbugs.py diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py new file mode 100644 index 00000000..a46580d1 --- /dev/null +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -0,0 +1,191 @@ +""" +Block to identify missing or ill-valued features in Malayalam. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. 
+ +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # NOUNS AND PROPER NOUNS ############################################### + if re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], + 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Deixis': ['Prox', 'Remt'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # plural pronouns do not distinguish gender + 
self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Deixis': ['Prox', 'Remt'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2', '3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. 
+ if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Nec'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + else: # verbal noun + self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 
'Cau'], + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From 84965b94e618f1f2b5fb2cdc3a46fca4dc897c5e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 23:44:42 +0100 Subject: [PATCH 305/871] Non-personal pronouns in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index a46580d1..fc25eccb 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -65,6 +65,12 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] }) + else: # not personal + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) # DETERMINERS ########################################################## elif node.upos == 'DET': if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' From cce7db13d41deeba166ff7a766ae58c6a4fb3db0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Nov 2022 11:41:50 +0100 Subject: [PATCH 306/871] Malayalam determiners have fewer features than pronouns. 
--- udapi/block/ud/ml/markfeatsbugs.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index fc25eccb..41d4cf09 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -73,23 +73,16 @@ def process_node(self, node): }) # DETERMINERS ########################################################## elif node.upos == 'DET': - if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2', '3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'PronType': ['Art'], + 'Definite': ['Ind'] }) else: - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': From 8a9435f4b3a510dc1b2f6f34c98ee9f5e9e80b5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Nov 2022 15:09:34 +0100 Subject: [PATCH 307/871] Added Chinese lemmatization. 
--- udapi/block/ud/zh/lemmatize.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 udapi/block/ud/zh/lemmatize.py diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py new file mode 100644 index 00000000..7db798a0 --- /dev/null +++ b/udapi/block/ud/zh/lemmatize.py @@ -0,0 +1,34 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + # The plural suffix -men. + '我們': '我', # trad + '我们': '我', # simp + '他們': '他', # trad + '他们': '他', # simp + '它們': '它', # trad + '它们': '它', # simp + '牠們': '牠', # trad + '她們': '她', # trad + '她们': '她', # simp + '人們': '人', # trad + '人们': '人' # simp + } + + def process_node(self, node): + """ + Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form From 72f045ef84ea000f403d210301d33d1acf3f7018 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Nov 2022 12:46:54 +0100 Subject: [PATCH 308/871] Enable rewriting of lemmas in Chinese. --- udapi/block/ud/zh/lemmatize.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 7db798a0..2b7a2dc5 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -5,6 +5,20 @@ class Lemmatize(Block): + def __init__(self, rewrite='empty', **kwargs): + """ + Create the ud.zh.Lemmatize block instance. 
+ + Args: + rewrite=empty: set the lemma if it was empty so far; do not touch the rest + rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest + rewrite=all: set the lemma regardless of what it was previously + """ + super().__init__(**kwargs) + if not re.match(r'^(empty|form|all)$', rewrite): + raise ValueError("Unexpected value of parameter 'rewrite'") + self.rewrite = rewrite + # dictionary: form --> lemma lemma = { # The plural suffix -men. @@ -27,8 +41,11 @@ def process_node(self, node): of Sino-Tibetan languages is pretty straightforward most of the time, as the lemma typically equals to the actual word form. """ - if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': - if node.form in self.lemma: - node.lemma = self.lemma[node.form] - else: - node.lemma = node.form + if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form From b4dd844870291532089ce518bb0ad4d1f562d92a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Nov 2022 14:59:31 +0100 Subject: [PATCH 309/871] Use lemmatization to make copulas acceptable. 
--- udapi/block/ud/zh/lemmatize.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 2b7a2dc5..9b4c7cba 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -45,7 +45,13 @@ def process_node(self, node): return elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return - if node.form in self.lemma: + # Verbs that are derived from the copula and tagged as the copula need + # to have the lemma of the copula (是 shì). + if re.search(r'是', node.form) and re.match(r'^(AUX|VERB)$', node.upos): + node.lemma = '是' + if node.form == '不是': + node.feats['Polarity'] = 'Neg' + elif node.form in self.lemma: node.lemma = self.lemma[node.form] else: node.lemma = node.form From 3bea246947ce88825cc15f690e0de744b85c37ee Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Nov 2022 23:40:44 +0100 Subject: [PATCH 310/871] Another Chinese copula. --- udapi/block/ud/zh/lemmatize.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 9b4c7cba..298f3501 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -46,9 +46,18 @@ def process_node(self, node): elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return # Verbs that are derived from the copula and tagged as the copula need - # to have the lemma of the copula (是 shì). - if re.search(r'是', node.form) and re.match(r'^(AUX|VERB)$', node.upos): - node.lemma = '是' + # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). 
+ # 亦為 亦为 Yì wèi také + # 則為 则为 Zé wèi potom + # 更為 更为 Gèng wèi více + # 認為 认为 Rènwéi myslet, věřit + # 以為 以为 Yǐwéi myslet, věřit + # 以爲 以为 Yǐwéi myslet, věřit + m = re.search(r'([是爲為为])', node.form) + if m and re.match(r'^(AUX|VERB)$', node.upos): + node.lemma = m.group(1) + if node.lemma == '爲': + node.lemma = '為' if node.form == '不是': node.feats['Polarity'] = 'Neg' elif node.form in self.lemma: From d9af327a10bc816334d6e0514f636206dfb44c9f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 27 Nov 2022 01:44:04 +0100 Subject: [PATCH 311/871] readers' parameter merge=1 so e.g. `udapy read.Conllu files=a.connlu,b.conllu merge=1` merges the two files into one document and should be equivalent to `cat a.conllu b.conllu | udapy read.Conllu from=-`. --- udapi/core/basereader.py | 82 +++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 53a1129c..a3b334da 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,7 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -28,6 +28,7 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self.merge = merge # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -111,43 +112,48 @@ def try_fast_load(self, document): if filehandle is None: self.finished = True return True - try: - trees = self.read_trees() - except NotImplementedError: - return False - document.meta['loaded_from'] = self.filename - document.meta['global.Entity'] = self._global_entity - if trees and trees[0].newdoc and trees[0].newdoc is not True: - document.meta["docname"] = trees[0].newdoc - - bundle, last_bundle_id = None, '' - for root in trees: - add_to_the_last_bundle = False - - if self.ignore_sent_id: - root._sent_id = None - elif root._sent_id is not None: - parts = root._sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - if self.zone != 'keep': - root.zone = self.zone - - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - bundle = document.create_bundle() - if last_bundle_id != '': - bundle.bundle_id = last_bundle_id - - bundle.add_tree(root) - - self.next_filehandle() - if self.filehandle is None: - self.finished = True + while True: + try: + trees = self.read_trees() + except NotImplementedError: + return False + + document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + if trees and trees[0].newdoc and trees[0].newdoc is not True: + document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + add_to_the_last_bundle = False + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone + + # assign new/next bundle to `bundle` if needed + if not 
bundle or not add_to_the_last_bundle: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + self.next_filehandle() + if self.filehandle is None: + self.finished = True + return True + if not self.merge: + return True return True # pylint: disable=too-many-branches,too-many-statements @@ -190,7 +196,7 @@ def process_document(self, document): while True: root = self.filtered_read_tree() if root is None: - if trees_loaded == 0 and self.files.has_next_file(): + if (trees_loaded == 0 or self.merge) and self.files.has_next_file(): filehandle = self.next_filehandle() continue self.finished = not self.files.has_next_file() From e148621de92ea26550634d4972b6e0093660a103 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:11:40 +0100 Subject: [PATCH 312/871] Lemmatization of negated verbs in Chinese. --- udapi/block/ud/zh/lemmatize.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 298f3501..75d62716 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -45,6 +45,9 @@ def process_node(self, node): return elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return + # Lemmatize negated verbs to their affirmative forms. + # 不是 bùshì = not be + # 没有 méiyǒu = not exist # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). 
# 亦為 亦为 Yì wèi také @@ -53,13 +56,16 @@ def process_node(self, node): # 認為 认为 Rènwéi myslet, věřit # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit - m = re.search(r'([是爲為为])', node.form) - if m and re.match(r'^(AUX|VERB)$', node.upos): - node.lemma = m.group(1) - if node.lemma == '爲': - node.lemma = '為' - if node.form == '不是': + if re.match(r'^(AUX|VERB)$', node.upos): + m1 = re.match(r'^(不|没)(.+)$', node.form) + m2 = re.search(r'([是爲為为])', node.form) + if m1: + node.lemma = m1.group(1) node.feats['Polarity'] = 'Neg' + elif m2: + node.lemma = m2.group(1) + if node.lemma == '爲': + node.lemma = '為' elif node.form in self.lemma: node.lemma = self.lemma[node.form] else: From 40f224a9e9554d9573d9059fb7aea16ea20a731a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:26:13 +0100 Subject: [PATCH 313/871] Oops! Wrong part! --- udapi/block/ud/zh/lemmatize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 75d62716..7658d9b4 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -60,7 +60,7 @@ def process_node(self, node): m1 = re.match(r'^(不|没)(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: - node.lemma = m1.group(1) + node.lemma = m1.group(2) node.feats['Polarity'] = 'Neg' elif m2: node.lemma = m2.group(1) From 0e0d53905e40848c0e7a11e4d87aa3715a93ee33 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:35:42 +0100 Subject: [PATCH 314/871] =?UTF-8?q?Another=20negation=20pattern:=20?= =?UTF-8?q?=E6=9C=AA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/zh/lemmatize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 7658d9b4..9c492800 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -48,6 +48,7 @@ def process_node(self, 
node): # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be # 没有 méiyǒu = not exist + # 未能 wèinéng = cannot # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). # 亦為 亦为 Yì wèi také @@ -57,7 +58,7 @@ def process_node(self, node): # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): - m1 = re.match(r'^(不|没)(.+)$', node.form) + m1 = re.match(r'^([不没未])(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) From 74445e4722de55a7e9714642def6fd09a1d5e2ae Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:50:50 +0100 Subject: [PATCH 315/871] Another negation pattern. --- udapi/block/ud/zh/lemmatize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 9c492800..436c3587 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -48,6 +48,7 @@ def process_node(self, node): # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be # 没有 méiyǒu = not exist + # 沒能 méinéng = cannot # 未能 wèinéng = cannot # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). @@ -58,7 +59,7 @@ def process_node(self, node): # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): - m1 = re.match(r'^([不没未])(.+)$', node.form) + m1 = re.match(r'^([不没沒未])(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) From 13088ed765f05f6f684595863a81a783e8ceafbb Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 18:18:26 +0100 Subject: [PATCH 316/871] Lemmatization of interrogative verbs in Chinese. 
--- udapi/block/ud/zh/lemmatize.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 436c3587..abacf29f 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -47,25 +47,32 @@ def process_node(self, node): return # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be - # 没有 méiyǒu = not exist - # 沒能 méinéng = cannot + # 沒有 没有 méiyǒu = not exist + # 沒能 没能 méinéng = cannot # 未能 wèinéng = cannot + # Lemmatize question verbs to their base forms. + # 要不要 yàobùyào = do (you) want? + # 有没有 yǒuméiyǒu = do (you) have? # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). - # 亦為 亦为 Yì wèi také - # 則為 则为 Zé wèi potom - # 更為 更为 Gèng wèi více - # 認為 认为 Rènwéi myslet, věřit - # 以為 以为 Yǐwéi myslet, věřit - # 以爲 以为 Yǐwéi myslet, věřit + # 亦為 亦为 yìwèi = také + # 則為 则为 zéwèi = potom + # 更為 更为 gèngwèi = více + # 認為 认为 rènwéi = myslet, věřit + # 以為 以为 yǐwéi = myslet, věřit + # 以爲 以为 yǐwéi = myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): m1 = re.match(r'^([不没沒未])(.+)$', node.form) - m2 = re.search(r'([是爲為为])', node.form) + m2 = re.match(r'^(.+)([不没沒未])\1$', node.form) + m3 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) node.feats['Polarity'] = 'Neg' elif m2: node.lemma = m2.group(1) + node.feats['Mood'] = 'Int' + elif m3: + node.lemma = m3.group(1) if node.lemma == '爲': node.lemma = '為' elif node.form in self.lemma: From 64f5bc7427efd3c32a84229e2c2c901b545118b4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 1 Dec 2022 14:26:38 +0100 Subject: [PATCH 317/871] print also number of documents and paragraphs if any, based on newdoc and newpar annotations --- udapi/block/util/wc.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 
137c95e9..e8ea2676 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -13,6 +13,7 @@ def __init__(self, tsv=False, **kwargs): """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 self.tsv = tsv def process_tree(self, tree): @@ -22,13 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): if self.tsv: - print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty)))) + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) else: print('%8d trees\n%8d words' % (self.trees, self.words)) if self.mwts: print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) if self.empty: print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) From c29590fefe4a045c8c33c0e8729c3a2582d1cf5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 2 Dec 2022 23:18:53 +0100 Subject: [PATCH 318/871] Enable separate checking of Flavio's approach to Latin morphology. --- udapi/block/ud/la/markfeatsbugs.py | 141 +++++++++++++++-------------- 1 file changed, 75 insertions(+), 66 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 96c7b682..149fcd18 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -12,15 +12,32 @@ class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. + + Args: + flavio=1: Accept features as defined by Flavio for treebanks he + maintains. 
By default, a more conservative set of features and + values is expected. + """ + super().__init__(**kwargs) + self.flavio = flavio + def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': - self.check_required_features(node, ['Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + rf = ['Gender', 'Number', 'Case'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes']} + if self.flavio: + rf.append('InflClass') + af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': self.check_required_features(node, ['Gender', 'Number', 'Case']) @@ -32,13 +49,20 @@ def process_node(self, node): 'Foreign': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree']) - self.check_allowed_features(node, { + rf = ['Gender', 'Number', 'Case', 'Degree'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Foreign': ['Yes']}) + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Foreign': ['Yes']} + if self.flavio: + # Flavio does not use Degree=Pos, hence Degree is not required. 
+ rf = [f for f in rf if f != 'Degree'] + rf.append('InflClass') + af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': self.check_required_features(node, ['PronType']) @@ -81,13 +105,19 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] }) else: - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + rf = ['PronType', 'Gender', 'Number', 'Case'] + af = { 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + if self.flavio: + rf.append('InflClass') + af['PronType'].append('Con') + af['InflClass'] = ['LatPron'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -98,73 +128,52 @@ def process_node(self, node): 'NumForm': ['Digit', 'Roman'] }) else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm']) self.check_allowed_features(node, { 'NumType': ['Card'], - 'NumForm': ['Word'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'NumForm': ['Word'] }) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Inf'], - 
'Polarity': ['Pos', 'Neg'] - }) - elif node.feats['VerbForm'] == 'Fin': - if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Person', 'Number']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # indicative or subjunctive - self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Voice': ['Act'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) + rf = ['Aspect', 'VerbForm'] + af = { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], + 'Polarity': ['Pos', 'Neg']} + if node.feats['VerbForm'] == 'Fin': + rf.extend(['Mood', 'Person', 'Number']) + af['Mood'] = ['Ind', 'Sub', 'Imp'] + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + if re.match(r'^(Ind|Sub)$', node.feats['Mood']): # indicative or subjunctive + rf.extend(['Voice', 'Tense']) + af['Voice'] = ['Act', 'Pass'] + af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Polarity': ['Pos', 'Neg'] - }) + rf.extend(['Tense', 'Gender', 'Number', 'Voice']) + af['Tense'] = ['Past'] + af['Voice'] = ['Act'] + af['Number'] = ['Sing', 'Plur'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] else: # verbal noun - self.check_required_features(node, 
['Tense', 'Number', 'Voice']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Vnoun'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular - 'Polarity': ['Pos', 'Neg'] - }) + rf.extend(['Tense', 'Voice']) + af['Tense'] = ['Past', 'Pres'] + af['Voice'] = ['Act'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + if self.flavio: + # Flavio has killed Tense in his treebanks. + rf = [f for f in rf if f != 'Tense'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': if node.feats['PronType'] != '': # Pronominal adverbs are neither compared nor negated. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'AdvType': ['Loc'] }) else: # The remaining adverbs are neither pronominal, nor compared or From 8b05a49741481d20cf4b0b4ec41bf92b4a696701 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 3 Dec 2022 12:36:35 +0100 Subject: [PATCH 319/871] Adjusted Latin feature rules. 
--- udapi/block/ud/la/markfeatsbugs.py | 209 +++++++++++++++++------------ 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 149fcd18..31d112b8 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -25,121 +25,146 @@ def __init__(self, flavio=False, **kwargs): self.flavio = flavio def process_node(self, node): + rf = [] + af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - rf = ['Gender', 'Number', 'Case'] + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Dim'], + 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - rf.append('InflClass') - af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'Abbr': ['Yes'], + 'Foreign': ['Yes']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Proper'] = ['Yes'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - rf = ['Gender', 'Number', 'Case', 'Degree'] + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case', 'Degree'] af = { + 'NumType': ['Ord', 'Dist'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: # Flavio does not use Degree=Pos, hence Degree is not required. rf = [f for f in rf if f != 'Degree'] - rf.append('InflClass') - af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Rel', 'Ind'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + } if node.feats['PronType'] == 'Prs': - if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] - }) - else: # not reflexive - if node.feats['Person'] == '3': # on, ona, ono, oni, ony - self.check_required_features(node, ['PronType', 'Person', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 
'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) - else: # 1st and 2nd person do not have gender: já, ty - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': # seipsum, se + # seipsum has gender and number but se does not, so it is not required + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Person'] = ['3'] + af['Case'] = ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + else: # not reflexive: ego, tu, is, nos + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender + if node.feats['Person'] == '3': # is, id + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + elif re.match(r'^(Rel|Ind)$', node.feats['PronType']): + rf.extend(['Gender', 'Number']) + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['LatAnom', 'LatPron'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': - if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2', '3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + rf = ['PronType', 'Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' + rf.extend(['Poss', 'Person[psor]']) + af['PronType'] = ['Prs'] + af['Poss'] = 'Yes' + af['Person[psor]'] = ['1', '2', '3'] + af['Reflex'] = ['Yes'] + # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus). + if node.feats['Person[psor]'] != '3': + rf.append('Number[psor]') + af['Number[psor]'] = ['Sing', 'Plur'] else: - rf = ['PronType', 'Gender', 'Number', 'Case'] - af = { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} - if self.flavio: - rf.append('InflClass') - af['PronType'].append('Con') - af['InflClass'] = ['LatPron'] - af['Form'] = ['Emp'] - self.check_required_features(node, rf) - self.check_allowed_features(node, af) + af['PronType'] = ['Dem', 'Rel', 'Ind', 'Tot', 'Con'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # NUMERALS ############################################################# elif node.upos == 'NUM': - self.check_required_features(node, ['NumType', 'NumForm']) + rf = ['NumType', 'NumForm'] + af = { + 'NumType': ['Card'], + 'NumForm': ['Word', 'Roman', 'Digit'] + } # Arabic digits and Roman numerals do not have inflection features. - if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] - }) - else: - self.check_required_features(node, ['NumType', 'NumForm']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'] - }) + if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - rf = ['Aspect', 'VerbForm'] + rf = ['VerbForm'] af = { - 'Aspect': ['Imp', 'Perf', 'Prosp'], 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], 'Polarity': ['Pos', 'Neg']} + # Main verbs have aspect but auxiliaries don't. 
+ if node.upos == 'VERB': + rf.append('Aspect') + af['Aspect'] = ['Imp', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': rf.extend(['Mood', 'Person', 'Number']) af['Mood'] = ['Ind', 'Sub', 'Imp'] @@ -150,40 +175,48 @@ def process_node(self, node): af['Voice'] = ['Act', 'Pass'] af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - rf.extend(['Tense', 'Gender', 'Number', 'Voice']) + rf.extend(['Tense', 'Gender', 'Number', 'Voice', 'Case']) af['Tense'] = ['Past'] - af['Voice'] = ['Act'] + af['Voice'] = ['Act', 'Pass'] af['Number'] = ['Sing', 'Plur'] af['Gender'] = ['Masc', 'Fem', 'Neut'] - else: # verbal noun + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + af['Degree'] = ['Abs'] + elif node.feats['VerbForm'] == 'Vnoun': rf.extend(['Tense', 'Voice']) af['Tense'] = ['Past', 'Pres'] - af['Voice'] = ['Act'] + af['Voice'] = ['Act', 'Pass'] af['Gender'] = ['Masc', 'Fem', 'Neut'] + # else: nothing to be added form VerbForm=Inf if self.flavio: # Flavio has killed Tense in his treebanks. rf = [f for f in rf if f != 'Tense'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] + if node.feats['VerbForm'] == 'Part': + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], - 'AdvType': ['Loc'] - }) - else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. 
- self.check_allowed_features(node, {}) + af = { + 'AdvType': ['Loc', 'Tim'], + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'] + } + if self.flavio: + af['Compound'] = 'Yes' + af['Form'] = 'Emp' + self.check_allowed_features(node, af) # PARTICLES ############################################################ elif node.upos == 'PART': - self.check_allowed_features(node, { + af = { + 'PartType': ['Int'], 'Polarity': ['Neg'] - }) + } + if self.flavio: + af['Form'] = 'Emp' + self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From ab86f1b93d6e20bf4f42c18c1af9f3c22c5e4f64 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 12:07:10 +0100 Subject: [PATCH 320/871] Refined features of pronouns in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 71 +++++++++++++----------------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 41d4cf09..96cf8b55 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -27,50 +27,38 @@ def process_node(self, node): 'Foreign': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + } if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) + af['Case'] = [c for c in af['Case'] if c != 
'Nom' and c != 'Voc'] else: # not reflexive - if node.feats['Person'] == '3': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Deixis': ['Prox', 'Remt'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # plural pronouns do not distinguish gender - self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Deixis': ['Prox', 'Remt'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # not personal - self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if 
node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': if node.feats['PronType'] == 'Art': @@ -82,7 +70,8 @@ def process_node(self, node): else: self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': From cd9b962cb602eced89466af00ee077afd20d63bc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 15:04:02 +0100 Subject: [PATCH 321/871] Write sentences in a HTML list. --- udapi/block/write/sentenceshtml.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 udapi/block/write/sentenceshtml.py diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. 
+ + Parameters: + if_missing: What to do if `root.text` is `None`? (default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. + * `empty`: print an empty line + * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` + * `fatal`: raise an exception + """ + super().__init__(**kwargs) + self.title = title + self.if_missing = if_missing + + def before_process_document(self, document): + super().before_process_document(document) + print('\n\n\n') + print('' + self.title + '') + print('\n\n

    \n') + + def after_process_document(self, document): + print("
\n\n") + super().after_process_document(document) + + def process_tree(self, tree): + print('
  • %s
  • ' % (tree.sent_id, tree.get_sentence(self.if_missing))) From faeecb50ca7c3dfbc1130c628593c9b7031f035e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 19:30:44 +0100 Subject: [PATCH 322/871] Refined feature tests for Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 96cf8b55..2372bd23 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -57,6 +57,15 @@ def process_node(self, node): elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': rf.append('Clusivity') af['Clusivity'] = ['In', 'Ex'] + # Interrogative pronouns, too, can be case-marked. Therefore, the + # base form must have Case=Nom. + # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan) + # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) + # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" + # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) + elif node.feats['PronType'] == 'Int': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## @@ -101,13 +110,18 @@ def process_node(self, node): }) elif node.feats['VerbForm'] == 'Fin': if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Voice']) + # Unlike other forms, the imperative distinguishes politeness. 
+ # The verb stem serves as an informal imperative: തുറ tuṟa "open" + # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" + # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" + self.check_required_features(node, ['Mood', 'Voice', 'Polite']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Polite': ['Infm', 'Form'] }) else: self.check_required_features(node, ['Mood', 'Tense', 'Voice']) From 9f1c9adadd6b5e53aa9cf5aaea9cd8e26cdfe663 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Tue, 6 Dec 2022 10:24:48 +0100 Subject: [PATCH 323/871] Further adjusted Latin feature rules. --- udapi/block/ud/la/markfeatsbugs.py | 155 ++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 48 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 31d112b8..323f60f7 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes': + if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -41,61 +41,71 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['VerbForm'] = ['Part'] + af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - if not node.feats['Abbr'] == 'Yes': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'NameType': ['Giv', 'Sur', 'Geo'], 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['Proper'] = ['Yes'] + af['Compound'] = 'Yes' + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - if not node.feats['Abbr'] == 'Yes': - rf = ['Gender', 'Number', 'Case', 'Degree'] + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + rf = ['Gender', 'Number', 'Case'] af = { 'NumType': ['Ord', 'Dist'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'Polarity': ['Neg']} if self.flavio: # Flavio does 
not use Degree=Pos, hence Degree is not required. - rf = [f for f in rf if f != 'Degree'] + # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Degree'].append('Dim') + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind'], + 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se + rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - af['Gender'] = ['Masc'] - af['Number'] = ['Sing'] + # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
+ af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] - af['Case'] = ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl'] else: # not reflexive: ego, tu, is, nos rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] @@ -104,22 +114,34 @@ def process_node(self, node): if node.feats['Person'] == '3': # is, id rf.append('Gender') af['Gender'] = ['Masc', 'Fem', 'Neut'] - elif re.match(r'^(Rel|Ind)$', node.feats['PronType']): + elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + elif node.feats['PronType'] == 'Ind': + rf = [f for f in rf if f != 'Case'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] + af['Compound'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Form'] = ['Emp'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': - rf = ['PronType', 'Gender', 'Number', 'Case'] + rf = ['PronType'] + if node.feats['Case']: + rf.extend(['Gender', 'Number', 'Case']) af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Abs', 'Sup'], + 'Polarity': ['Neg'] + } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) af['PronType'] = ['Prs'] @@ -131,11 +153,16 @@ def process_node(self, node): rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Tot', 'Con'] + af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] if self.flavio: # Flavio added 
InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['Compound'] = ['Yes'] af['Form'] = ['Emp'] + af['NumType'] = ['Card'] + af['Degree'].append('Dim') + if re.match(r'^(unus|ambo)', node.lemma): + af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # NUMERALS ############################################################# @@ -151,50 +178,59 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: - # Flavio added InflClass but not everywhere, so it is not required. + # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - rf = ['VerbForm'] + rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], - 'Polarity': ['Pos', 'Neg']} + 'VerbForm': ['Inf', 'Fin', 'Part'], + 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], + 'Polarity': ['Neg'] + } + if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + rf.append('Tense') + af['Tense'] = ['Pres', 'Fut'] + if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + rf.append('Voice') + af['Voice'] = ['Act', 'Pass'] # Main verbs have aspect but auxiliaries don't. 
- if node.upos == 'VERB': - rf.append('Aspect') - af['Aspect'] = ['Imp', 'Perf', 'Prosp'] - if node.feats['VerbForm'] == 'Fin': + # TODO: apparently, apparently AUXs have aspect as well + # if node.upos == 'VERB': + # rf.append('Aspect') + # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] + if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) + af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - if re.match(r'^(Ind|Sub)$', node.feats['Mood']): # indicative or subjunctive - rf.extend(['Voice', 'Tense']) - af['Voice'] = ['Act', 'Pass'] - af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - rf.extend(['Tense', 'Gender', 'Number', 'Voice', 'Case']) - af['Tense'] = ['Past'] - af['Voice'] = ['Act', 'Pass'] + rf.extend(['Gender', 'Number', 'Case']) af['Number'] = ['Sing', 'Plur'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - af['Degree'] = ['Abs'] - elif node.feats['VerbForm'] == 'Vnoun': - rf.extend(['Tense', 'Voice']) - af['Tense'] = ['Past', 'Pres'] - af['Voice'] = ['Act', 'Pass'] + af['Degree'] = ['Abs', 'Cmp'] af['Gender'] = ['Masc', 'Fem', 'Neut'] - # else: nothing to be added form VerbForm=Inf + af['Tense'].append('Past') + # else: nothing to be added for VerbForm=Inf if self.flavio: # Flavio has killed Tense in his treebanks. rf = [f for f in rf if f != 'Tense'] + af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] - if node.feats['VerbForm'] == 'Part': + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + if 'Degree' in af: + af['Degree'].append('Dim') + else: + af['Degree'] = ['Dim'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] + af['VerbForm'].append('Vnoun') self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -202,20 +238,43 @@ def process_node(self, node): af = { 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'] + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Polarity': ['Neg'] } if self.flavio: - af['Compound'] = 'Yes' - af['Form'] = 'Emp' + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['NumType'] = ['Card', 'Ord'] # e.g., primum + af['VerbForm'] = ['Part'] + af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ elif node.upos == 'PART': af = { - 'PartType': ['Int'], + 'PartType': ['Int', 'Emp'], 'Polarity': ['Neg'] } if self.flavio: - af['Form'] = 'Emp' + af['Form'] = ['Emp'] + af['PronType'] = ['Dem'] + self.check_allowed_features(node, af) + # CONJUNCTIONS ######################################################### + elif re.match(r'^[CS]CONJ$', node.upos): + af = { + 'PronType': ['Rel', 'Con'], + 'Polarity': ['Neg']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin'] + af['NumType'] = ['Card'] + self.check_allowed_features(node, af) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + if self.flavio: + af = { + 'VerbForm': ['Part'], + 'Proper': ['Yes']} self.check_allowed_features(node, af) # THE REST: 
NO FEATURES ################################################ else: From 3de5c225d9fc8e1a56922bd13b2feea8f4ca7bf4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 6 Dec 2022 11:21:33 +0100 Subject: [PATCH 324/871] Usage: the new parameter merge=1 implemented by Martin. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 2 +- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 309e7ac8..30ee90b2 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 31d112b8..74a06a07 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. 
Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 2372bd23..b286a27c 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging From 1544c7474cdf91aa2f1c52b3566dedf11a127e5f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 14 Dec 2022 17:17:00 +0100 Subject: [PATCH 325/871] update for newer versions of termcolor and colorama --- requirements.txt | 2 +- udapi/block/write/textmodetrees.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 647361f7..044d3af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -colorama +colorama>=0.4.6 termcolor ufal.udpipe diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index f3f6e007..41539670 100644 
--- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,4 +1,5 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys @@ -344,8 +345,12 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R" + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. + os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): print('%s = %s' % (key, value)) From 9b0d20115a4dfea531519bf54f8fe5326ac77261 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 16 Dec 2022 23:03:43 +0100 Subject: [PATCH 326/871] read.Sentences newdoc_if_empty_line=1 --- udapi/block/read/sentences.py | 14 ++++++++++++-- udapi/core/document.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 356e196f..9b428331 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -9,6 +9,8 @@ class Sentences(BaseReader): Args: ignore_empty_lines: if True, delete empty lines from the input. Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. Default=False. rstrip: a set of characters to be stripped from the end of each line. Default='\r\n '. You can use rstrip='\n' if you want to preserve any space or '\r' (Carriage Return) at end of line, @@ -16,8 +18,12 @@ class Sentences(BaseReader): As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. 
""" - def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line self.rstrip = rstrip super().__init__(**kwargs) @@ -38,11 +44,15 @@ def read_tree(self, document=None): # (or '\r\n' if reading a Windows file on Unix machine). if line == '': return None - if self.ignore_empty_lines: + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: while line in {'\n', '\r\n'}: + preceded_by_empty_line = True line = self.filehandle.readline() if line == '': return None root = Root() root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root diff --git a/udapi/core/document.py b/udapi/core/document.py index dcf146ea..d6a84f0e 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -23,7 +23,7 @@ def __init__(self, filename=None, **kwargs): No pre-processing is applied, so when loading the document from a *.txt file, `Document("a.txt").nodes` will be empty and you need to run tokenization first. You can pass additional parameters for `udapi.block.read.sentences` - (`ignore_empty_lines` and `rstrip`). + (`ignore_empty_lines`, `newdoc_if_empty_line` and `rstrip`). """ self.bundles = [] self._highest_bundle_id = 0 From 83989865bf94f3ae9355364f05ac32aef84e8979 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 20 Dec 2022 11:12:47 +0100 Subject: [PATCH 327/871] bugfix logging.warning takes multiple *args to be substituted for %s, not a single argument, see https://docs.python.org/3/library/logging.html#logging.debug However, using f-strings seems to be less error-prone. 
--- udapi/block/read/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index d703fb26..7e59e2f9 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -73,7 +73,7 @@ def parse_comment_line(self, line, root): if entity_match is not None: global_entity = entity_match.group(1) if self._global_entity and self._global_entity != global_entity: - logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity)) + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") self._global_entity = global_entity root.comment += '$GLOBAL.ENTITY\n' return From f93d4c92a64b9aad8bcdf1d2a8045bc6ae554cc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:04:47 +0100 Subject: [PATCH 328/871] fix a bug preventing to load two conllu files into two zones BaseReader calls ``` if self.zone != 'keep': root.zone = self.zone ``` so it supposes that root.sent_id will reflect the new zone. Originally, `root.sent_id` was computed each time on the fly, but after optimization it is cached in `root._sent_id`. 
--- udapi/core/root.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/root.py b/udapi/core/root.py index 0132566a..6a5717a2 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,6 +95,12 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + if self._bundle is not None: + self._sent_id = self._bundle.address() + '/' + zone + elif self._sent_id: + self._sent_id = self._sent_id.split('/', 1)[0] + '/' + zone + else: + self._sent_id = '?/' + zone @property def parent(self): From 187a2b20139a60c0ca3ad8f08325b3851a695e86 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:08:59 +0100 Subject: [PATCH 329/871] util.MarkDiff ignore_parent=1 sometimes we may not be interested in differences in the topology --- udapi/block/util/markdiff.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 3d183f57..6c57ab36 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -9,7 +9,7 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, print_stats=0, **kwargs): + mark=1, add=False, print_stats=0, ignore_parent=False, **kwargs): """Create the Mark block object. Params: gold_zone: Which of the zones should be treated as gold? @@ -20,6 +20,7 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. print_stats: How many lines of statistics should be printed? -1 means all. 
+ ignore_parent: ignore differences in dependency parents """ super().__init__(**kwargs) self.gold_zone = gold_zone @@ -27,6 +28,7 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc self.mark = mark self.add = add self.print_stats = print_stats + self.ignore_parent = ignore_parent self.stats = collections.Counter() def process_tree(self, tree): @@ -60,7 +62,7 @@ def process_tree(self, tree): edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: p_node.misc['Mark'] = self.mark g_node.misc['Mark'] = self.mark self.stats['ONLY-PARENT-CHANGED'] += 1 @@ -76,7 +78,7 @@ def process_tree(self, tree): p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) if p_value != g_value: self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: self.stats['PARENT-CHANGED'] += 1 pred_lo, gold_lo = pred_lo + n, gold_lo + n for node in gold_nodes[gold_lo:gold_hi]: From 2ad4922b5f9fe4196c5b67a00f42f45039f83c3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:09:42 +0100 Subject: [PATCH 330/871] write.TextModeTreesHtml prints zones side by side by default --- udapi/block/write/textmodetreeshtml.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 9f9f6aa2..7fedc1b8 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more 
info. """ - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -38,6 +38,7 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -82,3 +83,15 @@ def print_headers(self, root): print(escape(text)) if self.print_comments and root.comment: print('#' + self.colorize_comment(escape(root.comment)).rstrip().replace('\n', '\n#')) + + def process_bundle(self, bundle): + if self.zones_in_rows: + print("") + for tree in bundle: + if self._should_process_tree(tree): + print("") + print("
    ") + self.process_tree(tree) + print("
    ") + else: + super().process_bundle(bundle) From a49785d844e85771d499b3431cf8d8c9f3878307 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:28:09 +0100 Subject: [PATCH 331/871] empty zone does not need a slash in sent_id --- udapi/core/root.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udapi/core/root.py b/udapi/core/root.py index 6a5717a2..3e6bf62b 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,12 +95,13 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + slashzone = '/' + zone if zone else '' if self._bundle is not None: - self._sent_id = self._bundle.address() + '/' + zone + self._sent_id = self._bundle.address() + slashzone elif self._sent_id: - self._sent_id = self._sent_id.split('/', 1)[0] + '/' + zone + self._sent_id = self._sent_id.split('/', 1)[0] + slashzone else: - self._sent_id = '?/' + zone + self._sent_id = '?' + slashzone @property def parent(self): From 5a7ccdc00b7466d1a1469fec9b2a0a63efce1880 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 11 Jan 2023 14:43:05 +0100 Subject: [PATCH 332/871] Case=Ben allowed in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index b286a27c..47437e2a 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -19,7 +19,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], 'Foreign': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': @@ -30,7 +30,7 @@ def process_node(self, node): rf = ['PronType', 'Case'] af = { 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] @@ -97,7 +97,7 @@ def process_node(self, node): 'NumType': ['Card'], 'NumForm': ['Word'], 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] }) # VERBS ################################################################ elif node.upos == 'VERB': From e9fe589322d5f6d03d318862bc93ec9eba26bd85 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 18 Jan 2023 12:49:10 +0100 Subject: [PATCH 333/871] Comment: link to the issue where "interleaved" is defined. 
https://github.com/ufal/corefUD/issues/25 --- udapi/block/corefud/fixinterleaved.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py index c5a1b3ed..026b19f3 100644 --- a/udapi/block/corefud/fixinterleaved.py +++ b/udapi/block/corefud/fixinterleaved.py @@ -3,7 +3,9 @@ import itertools class FixInterleaved(Block): - """Fix mentions with interleaved or crossing spans.""" + """Fix mentions with interleaved or crossing spans. + https://github.com/ufal/corefUD/issues/25 + """ def __init__(self, same_entity_only=True, both_discontinuous=False, crossing_only=False, nested_same_subspan=True, **kwargs): @@ -58,8 +60,8 @@ def process_tree(self, tree): pass deleted.add(mB) - # By changing the mA.words, we could have create another error: - # making the span same as another mention. Let's fix it + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. sA = set(mA.words) for mC in mentions: if mC in deleted or mC is mA or mC is mB: From 6a9501b6522fca2fe4d38c2fcdf8946170ae69c4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:44:30 +0100 Subject: [PATCH 334/871] Updated feature checking for ml. 
--- udapi/block/ud/ml/markfeatsbugs.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 47437e2a..54119030 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -19,18 +19,21 @@ def process_node(self, node): self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], - 'Foreign': ['Yes']}) + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] @@ -74,13 +77,15 @@ def process_node(self, node): self.check_required_features(node, ['PronType', 'Definite']) self.check_allowed_features(node, { 'PronType': ['Art'], - 'Definite': ['Ind'] + 'Definite': ['Ind'], + 'Typo': ['Yes'] }) else: self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], - 'Deixis': ['Prox', 'Remt'] + 'Deixis': ['Prox', 'Remt'], + 'Typo': ['Yes'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': @@ -89,24 +94,27 @@ def process_node(self, 
node): if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): self.check_allowed_features(node, { 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] + 'NumForm': ['Digit', 'Roman'], + 'Typo': ['Yes'] }) else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], 'NumForm': ['Word'], 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes'] }) # VERBS ################################################################ elif node.upos == 'VERB': - self.check_required_features(node, ['VerbForm', 'Voice']) + self.check_required_features(node, ['VerbForm']) if node.feats['VerbForm'] == 'Inf': self.check_allowed_features(node, { 'VerbForm': ['Inf'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Fin': if node.feats['Mood'] == 'Imp': @@ -121,26 +129,39 @@ def process_node(self, node): 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], - 'Polite': ['Infm', 'Form'] + 'Polite': ['Infm', 'Form'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) else: self.check_required_features(node, ['Mood', 'Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Nec'], + 'Mood': ['Ind', 'Pot'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 
'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': - self.check_required_features(node, ['Tense', 'Voice']) + self.check_required_features(node, ['Tense']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Part'], 'Tense': ['Past'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) else: # verbal noun self.check_required_features(node, ['Tense', 'Voice']) @@ -151,6 +172,7 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## elif node.upos == 'AUX': @@ -161,7 +183,8 @@ def process_node(self, node): 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] }) else: # indicative or subjunctive self.check_required_features(node, ['Mood', 'Tense']) @@ -171,23 +194,26 @@ def process_node(self, node): 'Mood': ['Ind', 'Sub'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'] + 'Typo': ['Yes'] }) # ADVERBS ############################################################## elif node.upos == 'ADV': if node.feats['PronType'] != '': # Pronominal adverbs are neither compared nor negated. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] }) else: # The remaining adverbs are neither pronominal, nor compared or # negated. 
- self.check_allowed_features(node, {}) + self.check_allowed_features(node, {'Typo': ['Yes']}) # PARTICLES ############################################################ elif node.upos == 'PART': self.check_allowed_features(node, { - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] }) # THE REST: NO FEATURES ################################################ else: - self.check_allowed_features(node, {}) + self.check_allowed_features(node, {'Typo': ['Yes']}) From 448bba23b9aa90f8741efcd7565a516a7c84c85b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:45:28 +0100 Subject: [PATCH 335/871] bug fix --- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 54119030..4741d2fa 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -193,7 +193,7 @@ def process_node(self, node): 'VerbForm': ['Fin'], 'Mood': ['Ind', 'Sub'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], 'Typo': ['Yes'] }) # ADVERBS ############################################################## From 7524bd5cdbe88661eb09eb46f88bc3de07f5716e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:59:48 +0100 Subject: [PATCH 336/871] Updated feature checking for ml. 
--- udapi/block/ud/ml/markfeatsbugs.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 4741d2fa..be084e22 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -25,6 +25,7 @@ def process_node(self, node): # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { + 'VerbForm': ['Part'], 'Foreign': ['Yes'], 'Typo': ['Yes']}) # PRONOUNS ############################################################# @@ -66,9 +67,9 @@ def process_node(self, node): # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) - elif node.feats['PronType'] == 'Int': - rf.append('Animacy') - af['Animacy'] = ['Anim', 'Inan'] + #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## @@ -122,13 +123,12 @@ def process_node(self, node): # The verb stem serves as an informal imperative: തുറ tuṟa "open" # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" - self.check_required_features(node, ['Mood', 'Voice', 'Polite']) + self.check_required_features(node, ['Mood', 'Polite']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'], 'Polite': ['Infm', 'Form'], 'Typo': ['Yes'] }) @@ -164,7 +164,9 @@ def process_node(self, node): 'Typo': ['Yes'] }) else: # verbal noun - 
self.check_required_features(node, ['Tense', 'Voice']) + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Vnoun'], From 0c0e0a257896741295c27661397e5d263aa8d1dc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 15:12:49 +0100 Subject: [PATCH 337/871] AUX allows Vnoun. --- udapi/block/ud/ml/markfeatsbugs.py | 45 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index be084e22..4f17c45f 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -174,28 +174,45 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## elif node.upos == 'AUX': self.check_required_features(node, ['VerbForm']) - if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prog'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Polarity': ['Pos', 'Neg'], - 'Typo': ['Yes'] - }) - else: # indicative or subjunctive - self.check_required_features(node, ['Mood', 'Tense']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] }) # ADVERBS ############################################################## From 94e7e85033515b101873c58e16a97dcd7b465dd9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 15:15:41 +0100 Subject: [PATCH 338/871] Foreign VERB --- udapi/block/ud/ml/markfeatsbugs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 4f17c45f..2cb4f791 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -115,6 +115,7 @@ def process_node(self, node): 'VerbForm': ['Inf'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Fin': @@ -130,6 +131,7 @@ def process_node(self, node): 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], 'Polite': ['Infm', 'Form'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['Mood'] == 'Nec': @@ -140,6 +142,7 @@ def process_node(self, node): 'Mood': ['Nec'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) else: @@ -151,6 +154,7 @@ def process_node(self, node): 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': @@ -161,6 +165,7 @@ def process_node(self, node): 'Tense': ['Past'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) else: # verbal noun @@ -176,6 +181,7 @@ def process_node(self, node): 'Voice': ['Act', 'Pass', 'Cau'], # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## From e79bd16052f39cad08782315887df7849177ce3d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 22:46:50 +0100 Subject: [PATCH 339/871] Conditional in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 2cb4f791..75552c36 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -32,7 +32,7 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] } @@ -150,7 +150,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Pot'], + 'Mood': ['Ind', 'Pot', 'Cnd'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], From 337e7f6d159cf68bacb88529ea843c6c8b67a18d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 26 Jan 2023 00:05:13 +0100 Subject: [PATCH 340/871] Conditional in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 75552c36..5ca2b4fb 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -202,7 +202,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], + 'Mood': ['Ind', 'Sub', 'Cnd'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Typo': ['Yes'] From b6600ea65e001d76ffbec656382384d60511d76c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Jan 2023 00:07:40 +0100 Subject: [PATCH 341/871] Don't print empty tables if no trees will be printed in a given bundle Fixes #110 --- udapi/block/write/textmodetreeshtml.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 7fedc1b8..5ccceb78 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -86,12 +86,22 @@ def print_headers(self, root): def process_bundle(self, bundle): if self.zones_in_rows: - print("") + # Don't print
    if no tree will be printed in this bundle. + marked_trees = [] for tree in bundle: if self._should_process_tree(tree): + if self.print_empty: + allnodes = [tree] + tree.descendants_and_empty + else: + allnodes = tree.descendants(add_self=1) + if self.should_print_tree(tree, allnodes): + marked_trees.append(tree) + if marked_trees: + print("") + for tree in marked_trees: print("") - print("
    ") self.process_tree(tree) print("
    ") + print("") else: super().process_bundle(bundle) From b8b68bf6474751dbf5ec7205ea40936c19c5aa73 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 26 Jan 2023 09:56:25 +0100 Subject: [PATCH 342/871] Do not check foreign words for Malayalam features. --- udapi/block/ud/ml/markfeatsbugs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 5ca2b4fb..12e2ef0f 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -13,8 +13,17 @@ class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. 
+ if node.feats['Foreign'] == 'Yes': + pass # NOUNS AND PROPER NOUNS ############################################### - if re.match(r'^(NOUN|PROPN)$', node.upos): + elif re.match(r'^(NOUN|PROPN)$', node.upos): self.check_required_features(node, ['Animacy', 'Number', 'Case']) self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], From 1335522492d7c6cc528ab576dfb3142d4aac67e3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Jan 2023 21:59:37 +0100 Subject: [PATCH 343/871] improve definition of almost_forest in PrintMentions --- udapi/block/corefud/printmentions.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 7ed31b0d..12db433a 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -10,7 +10,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, - print_total=True, + print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -33,6 +33,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i random.seed(42) self.print_other_forms = print_other_forms self.print_total = print_total, + self.print_should = print_should, print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -61,7 +62,9 @@ def _ok(self, condition, value): return (condition and value == 'only') or (not condition and value=='exclude') def _is_auxiliary_etc(self, 
node): - if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}: + if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': return True if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: return True @@ -79,8 +82,25 @@ def _is_forest(self, mention, mwords, almost): for ch in w.children: if ch not in mwords: if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). + # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False return True From 0178372e381accb9c28795bcfff5f21366e48520 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 Jan 2023 22:49:12 +0100 Subject: [PATCH 344/871] Malayalam adpositions can have the Case feature. 
--- udapi/block/ud/ml/markfeatsbugs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 12e2ef0f..c2a8e0f4 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -242,6 +242,13 @@ def process_node(self, node): # The remaining adverbs are neither pronominal, nor compared or # negated. self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes']}) # PARTICLES ############################################################ elif node.upos == 'PART': self.check_allowed_features(node, { From c3da386bf36609774e34464899a048700631b4b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 Jan 2023 11:08:43 +0100 Subject: [PATCH 345/871] ud.SetTranslation (e.g. lines from Google Translate) --- udapi/block/ud/settranslation.py | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 udapi/block/ud/settranslation.py diff --git a/udapi/block/ud/settranslation.py b/udapi/block/ud/settranslation.py new file mode 100644 index 00000000..487cca06 --- /dev/null +++ b/udapi/block/ud/settranslation.py @@ -0,0 +1,59 @@ +""" +Block SetTranslation for setting of sentence-level translation (the attribute +text_en for English translation) from a separate text file (one sentence per +line). For example, one can export the original sentences using write.SentencesHtml, +then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain +text editor, save them as translations.txt and import them using this block. 
+ +Usage: +udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class SetTranslation(Block): + """ + Set text_en to the next available translation. + """ + + def __init__(self, file, overwrite=False, **kwargs): + """ + Create the SetTranslation block. + + Parameters: + file: the name of the text file with the translations (one sentence per line) + overwrite=1: set the translation even if the sentence already has one + (default: do not overwrite existing translations) + """ + super().__init__(**kwargs) + self.file = file + fh = open(self.file, 'r', encoding='utf-8') + self.trlines = fh.readlines() + self.nlines = len(self.trlines) + self.iline = 0 + self.overwrite = overwrite + + def process_tree(self, tree): + if self.iline < self.nlines: + translation = self.trlines[self.iline] + self.iline += 1 + comments = [] + if tree.comment: + comments = tree.comment.split('\n') + i_tr = -1 + for i in range(len(comments)): + # The initial '#' character has been stripped. + if re.match(r'\s*text_en\s*=', comments[i]): + i_tr = i + break + if i_tr >= 0: + if self.overwrite: + comments[i_tr] = ' text_en = ' + translation + else: + comments.append(' text_en = ' + translation) + tree.comment = '\n'.join(comments) + elif self.iline == self.nlines: + logging.warning('There are only %d translation lines but there are more input sentences.' 
% self.nlines) From a75ab8d8bd9754b776911c41977fbcacdcf3b521 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 00:52:29 +0100 Subject: [PATCH 346/871] first draft of a coreference-visualization writer --- udapi/block/write/corefhtml.py | 123 +++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 udapi/block/write/corefhtml.py diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..fc49dfb4 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,123 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +class CorefHtml(BaseWriter): + + def __init__(self, path_to_js='web', **kwargs): + super().__init__(**kwargs) + self.path_to_js = path_to_js + + def process_document(self, doc): + print('') + print('Udapi CorefUD viewer') + print('') + #print('') #$(window).on("load", function() {...} + #print('') + print('') + print('\n') + + for tree in doc.trees: + self.process_tree(tree) + + print('') + print('') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + sent_mentions = [] + for mention in mentions: + mspan = mention.span + if ',' not in mspan: + sent_mentions.append(mention) + else: + entity = mention.entity + head_str = str(mention.words.index(mention.head) + 1) + subspans = mspan.split(',') + for idx,subspan in enumerate(subspans, 1): + subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' + subspan_words = span_to_nodes(tree, subspan) + fake_entity = CorefEntity(subspan_eid, entity.etype) + fake_mention = CorefMention(subspan_words, head_str, fake_entity, 
add_word_backlinks=False) + if mention._other: + fake_mention._other = mention._other + if mention._bridging and idx == 1: + fake_mention._bridging = mention._bridging + sent_mentions.append(fake_mention) + sent_mentions.sort(reverse=True) + + opened = [] + print('

    ') + for node in nodes_and_empty: + while sent_mentions and sent_mentions[-1].words[0] == node: + m = sent_mentions.pop() + e = m.entity + classes = f'{e.eid} {e.etype or "other"}' + if all(w.is_empty() for w in m.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' + print(f'', end='') + opened.append(m) + + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + print('', end='') + opened.pop() + + if not node.no_space_after: + print(' ', end='') + + print('

    ') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + +# id needs to be a valid DOM querySelector +# so it cannot contain # nor / and it cannot start with a digit +def _id(node): + if node is None: + return 'null' + return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') From e3ae1c3fb65fa62431e23c2bfff9d8534d458019 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 13:25:49 +0100 Subject: [PATCH 347/871] fix visualization of discontinuous mentions introduce CorefMentionSubspan instead of fake mentions (should be used also in store_coref_to_misc() in future) --- udapi/block/write/corefhtml.py | 40 +++++++++++----------------------- udapi/core/coref.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index fc49dfb4..890b172a 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -44,44 +44,30 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) - sent_mentions = [] + subspans = [] for mention in mentions: - mspan = mention.span - if ',' not in mspan: - sent_mentions.append(mention) - else: - entity = mention.entity - head_str = str(mention.words.index(mention.head) + 1) - subspans = mspan.split(',') - for idx,subspan in enumerate(subspans, 1): - subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' - subspan_words = span_to_nodes(tree, subspan) - fake_entity = CorefEntity(subspan_eid, entity.etype) - fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) - if mention._other: - fake_mention._other = mention._other - if mention._bridging and idx == 1: - fake_mention._bridging = mention._bridging - sent_mentions.append(fake_mention) - 
sent_mentions.sort(reverse=True) + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) opened = [] print('

    ') for node in nodes_and_empty: - while sent_mentions and sent_mentions[-1].words[0] == node: - m = sent_mentions.pop() + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + m = subspan.mention e = m.entity classes = f'{e.eid} {e.etype or "other"}' - if all(w.is_empty() for w in m.words): + if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: classes += ' singleton' - title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' - print(f'', end='') - opened.append(m) + title += f'\n{m.other}' + print(f'', end='') #data-eid="{e.eid}" + + opened.append(subspan) is_head = self._is_head(node) if is_head: diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3eb76db3..1a6d1f95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -128,6 +128,17 @@ def __init__(self, words, head=None, entity=None, add_word_backlinks=True): new_word._mentions.append(self) new_word._mentions.sort() + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? 
@@ -247,6 +258,32 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + assert False + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + + CHARS_FORBIDDEN_IN_ID = "-=| \t()" @@ -886,7 +923,7 @@ def nodes_to_span(nodes): Note that empty nodes may form gaps in the span, so if a given tree contains an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". - This means that the implementation needs to iterate of all nodes + This means that the implementation needs to iterate over all nodes in a given tree (root.descendants_and_empty) to check for such gaps. 
""" if not nodes: From b78ef7eea0b76c4f41f8408d918092681d9c5fad Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:16:46 +0100 Subject: [PATCH 348/871] util.Normalize: sort attributes in FEATS and MISC --- udapi/block/util/normalize.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 udapi/block/util/normalize.py diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..5b4270cc --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,40 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + # TODO: normalize also standardized comments like text, sent_id,... 
+ + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None From 90f338de077467acb4cb9ebebce68179419a0d77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:29:30 +0100 Subject: [PATCH 349/871] allow writing to node.sdeprel, add tests --- udapi/core/node.py | 8 ++++++++ udapi/core/tests/test_node.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 63242698..e188e134 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. 
diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 28a45d85..8bc7f182 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -119,7 +119,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -145,6 +145,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. 
From 5817af214df034e42cf09ef2c08f0c8d15b3a0d9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 16:31:50 +0100 Subject: [PATCH 350/871] write.CorefHtml marks subspans of discontiuous mentions with a red border --- udapi/block/write/corefhtml.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 890b172a..e0ab830b 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,26 +18,34 @@ def process_document(self, doc): #print('') print('') print('\n') + mention_ids = {} + for entity in doc.coref_entities: + for idx, mention in enumerate(entity.mentions, 1): + mention_ids[mention] = f'{entity.eid}e{idx}' + for tree in doc.trees: - self.process_tree(tree) + self.process_tree(tree, mention_ids) print('') + ' e.stopPropagation();\n});\n' + '$("span").hover(\n' + ' function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");},\n' + ' function(e) {$("span").removeClass("active");}\n' + ');\n') print('') - def process_tree(self, tree): + def process_tree(self, tree, mention_ids): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -56,7 +64,7 @@ def process_tree(self, tree): subspan = subspans.pop() m = subspan.mention e = m.entity - classes = f'{e.eid} {e.etype or "other"}' + classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: From 355e7bdc32ab854827aff1f7277b069f5c5a8bc0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 17:57:48 +0100 Subject: [PATCH 351/871] write.CorefHtml shows also crossing mentions using valid (well-nested) html --- udapi/block/write/corefhtml.py | 56 +++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/udapi/block/write/corefhtml.py 
b/udapi/block/write/corefhtml.py index e0ab830b..3efe9793 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,7 +18,8 @@ def process_document(self, doc): #print('') print('') @@ -35,15 +74,37 @@ def process_document(self, doc): for tree in doc.trees: self.process_tree(tree, mention_ids) - print('') + print('') print('') def _start_subspan(self, subspan, mention_ids, crossing=False): @@ -74,8 +135,10 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) + if tree.newpar: + print('


    ') opened = [] - print('

    ') + print(f'

    ') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() From 9e11bd515e19fa59c0bdbc50654d29544b13a21b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Feb 2023 18:03:19 +0100 Subject: [PATCH 358/871] util.Normalize now normalizes also sent_id --- udapi/block/util/normalize.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 5b4270cc..298bea42 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,16 +20,33 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers + `start_sent_id`: the first sent_id number """ super().__init__(**kwargs) self.feats = feats self.misc = misc - # TODO: normalize also standardized comments like text, sent_id,... + self.sent_id = sent_id + self.next_sent_id = start_sent_id + # TODO: normalize also the order of standardized comments like text, sent_id,... 
+ + def process_bundle(self, bundle): + if self.sent_id: + bundle.bundle_id = str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + for node in tree.descendants: + self.process_node(node) def process_node(self, node): if self.feats: From 4e1b75678dab1f2602cc26b641a31de977a98f14 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 00:47:37 +0100 Subject: [PATCH 359/871] sent_id should not be normalized by default Unlike feats and misc ordering, we can lose information this way - the original sent_id, so it is potentially dangerous. --- udapi/block/util/normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 298bea42..48cd6dc1 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,12 +20,12 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. - `sent_id`: normalize sent_id so it forms a sequence of integers + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
`start_sent_id`: the first sent_id number """ super().__init__(**kwargs) From b899af14c12c7ba4c9750ba39bf5f5544783ba59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 09:53:59 +0100 Subject: [PATCH 360/871] write.Conllu path=another/directory keeps the file name, but changes the directory --- udapi/core/basewriter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cdc2c38f..93f6463a 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,7 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os import udapi.core.coref from udapi.core.block import Block @@ -11,7 +12,7 @@ class BaseWriter(Block): """Base class for all reader blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +30,7 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + self.path = path @property def filename(self): @@ -60,9 +62,11 @@ def before_process_document(self, document): sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + docname = os.path.join(self.path, os.path.split(docname)[1]) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: From 
9d183c1d979c50fabff9b3a295a0d8194a09c790 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 10:14:59 +0100 Subject: [PATCH 361/871] etype mismatch is stored in mention.other["orig_etype"] which allows easier debugging --- udapi/core/coref.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1a13d9fb..12dda239 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -665,6 +665,7 @@ def load_coref_from_misc(doc, strict=True): entity.etype = etype elif etype and entity.etype and entity.etype != etype: logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype # CorefEntity could be created first with "Bridge=" without any type elif etype and entity.etype is None: entity.etype = etype From 5b3ed0268ccf76f5332fcce87ac0da9a42b221b8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:19:33 +0100 Subject: [PATCH 362/871] allow using e.g. write.CorefHtml path='html/*.html' --- udapi/core/basewriter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 93f6463a..e17a64c3 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -66,11 +66,21 @@ def before_process_document(self, document): docname = document.meta.get('loaded_from', None) if docname is not None: if self.path: - docname = os.path.join(self.path, os.path.split(docname)[1]) + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, 
newline=self.newline) else: - logging.warning('overwrite=1 but document.meta["loaded_from"] is None') + logging.warning('using overwrite or path but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: From 34aa19d7d892790b81b2b79579fc4391c07a23ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:42:30 +0100 Subject: [PATCH 363/871] write.Conllu path=my_dir should be interpreted as path=my_dir/ --- udapi/core/basewriter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index e17a64c3..6e1b7446 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -30,6 +30,9 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep self.path = path @property From 301b808082254a9b45a2bd4cfe162719dc02bc23 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 4 Feb 2023 01:36:25 +0100 Subject: [PATCH 364/871] corefud.GuessSpan: add empty nodes that are causing gaps --- udapi/block/corefud/guessspan.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py index 5c3c6c12..d6093ece 100644 --- a/udapi/block/corefud/guessspan.py +++ b/udapi/block/corefud/guessspan.py @@ -4,6 +4,30 @@ class GuessSpan(Block): """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" def process_coref_mention(self, mention): - mention.words = mention.head.descendants(add_self=True) - # TODO add empty nodes that are causing gaps + mwords = mention.head.descendants(add_self=True) # TODO add heuristics from 
corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. + to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) From 2285d27f5e9444d3db7a8a0b8db227b38e5c082b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 5 Feb 2023 01:06:32 +0100 Subject: [PATCH 365/871] write.CorefHtml: distinguish entities using colors, show eid and docname --- udapi/block/write/corefhtml.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 8503854f..0a06b7e5 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -1,19 +1,21 @@ """CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" from udapi.core.basewriter import BaseWriter from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter import udapi.block.write.html ETYPES = 'person place organization animal plant object substance time number abstract event'.split() CSS = ''' .sentence 
span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} -.singleton {border-style: dotted;} +.sentence .singleton {border-style: dotted;} .crossing:before {content: "!"; display: block; background: #ffd500;} .active {border: 1px solid red !important;} -.selected {background: red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} .other {background: hsl(0, 0%, 85%);} ''' @@ -50,9 +52,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, **kwargs): + def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees + self.show_eid = show_eid + self.colors = colors def process_document(self, doc): print('') @@ -63,16 +67,25 @@ def process_document(self, doc): print('') print('\n') mention_ids = {} + entity_colors = {} + entities_of_type = Counter() for entity in doc.coref_entities: + if self.colors: + count = entities_of_type[entity.etype] + entities_of_type[entity.etype] = count + 1 + entity_colors[entity] = f'c{count % self.colors}' for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' for tree in doc.trees: - self.process_tree(tree, mention_ids) + self.process_tree(tree, mention_ids, entity_colors) print('') print('') - def _start_subspan(self, subspan, mention_ids, crossing=False): + def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention e = m.entity classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' - title = f'eid={subspan.subspan_eid}\ntype={e.etype}\nhead={m.head.form}' + title = f'eid={subspan.subspan_eid}\ntype={e.etype} 
({entity_colors[e]})\nhead={m.head.form}' + if self.colors: + classes += f' {entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -121,9 +136,11 @@ def _start_subspan(self, subspan, mention_ids, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') #data-eid="{e.eid}" + print(f'', end='') + if self.show_eid: + print(f'{subspan.subspan_eid}', end='') - def process_tree(self, tree, mention_ids): + def process_tree(self, tree, mention_ids, entity_colors): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -135,14 +152,16 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) - if tree.newpar: + if tree.newdoc: + print(f'


    {tree.newdoc if tree.newdoc is not True else ""}


    ') + elif tree.newpar: print('
    ') opened = [] print(f'

    ') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids) + self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) is_head = self._is_head(node) @@ -180,7 +199,7 @@ def process_tree(self, tree, mention_ids): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, True) + self._start_subspan(broken, mention_ids, entity_colors, True) opened.append(subspan) if not node.no_space_after: From cae7c37efe8548c2e432b108e4aa06df3b778e3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:07:42 +0100 Subject: [PATCH 366/871] `read.Conllu max_docs=3` will load only the first three documents This is nice for debugging coreference files, where we cannot load just first N sentences because there may be Bridge/SplitAnte referring to unknown eid. This way we load whole docs. --- udapi/block/read/conllu.py | 22 ++++++++++++++++++++-- udapi/core/basereader.py | 31 ++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index bba69696..d5623fba 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -81,8 +81,26 @@ def parse_comment_line(self, line, root): root.comment += line[1:] + "\n" def read_trees(self): - return [self.read_tree_from_lines(s.split('\n')) for s in - self.filehandle.read().split('\n\n') if s] + if not self.max_docs: + return [self.read_tree_from_lines(s.split('\n')) for s in + self.filehandle.read().split('\n\n') if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. 
+ trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + trees.append(tree) + else: + lines.append(line) + return def read_tree(self): if self.filehandle is None: diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a3b334da..a841bf1b 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,8 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -29,6 +30,8 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -126,6 +129,11 @@ def try_fast_load(self, document): bundle, last_bundle_id = None, '' for root in trees: + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 add_to_the_last_bundle = False if self.ignore_sent_id: @@ -180,8 +188,10 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity document.meta['loaded_from'] = self.filename @@ -204,6 +214,17 @@ def process_document(self, document): if trees_loaded == 0: document.meta['loaded_from'] = self.filename document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -222,6 +243,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. 
if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -231,6 +255,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: From ae34d8024d8ee95db6e1bf39581e44fc08bcbc73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:25:50 +0100 Subject: [PATCH 367/871] refactor code duplication --- udapi/block/write/corefhtml.py | 29 +++-------------------------- udapi/block/write/html.py | 28 +++++++++++++++------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 0a06b7e5..c7950ce9 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -50,6 +50,8 @@ }); ''' +WRITE_HTML = udapi.block.write.html.Html() + class CorefHtml(BaseWriter): def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): @@ -90,32 +92,7 @@ def process_document(self, doc): print('') print('') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..48431900 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,26 @@ def process_document(self, doc): print('\n') print('

    ') + + def print_doc_json(self, doc): print('data=[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue zone = tree.zone if first_zone: first_zone = False @@ -101,24 +111,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable From ca4d2b7f8240a0faca55f9aad6513d9a94968a08 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 19:53:25 +0100 Subject: [PATCH 368/871] write.CorefHtml: add side panel with an overview of entities --- udapi/block/write/corefhtml.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index c7950ce9..280fc213 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -6,7 +6,25 @@ ETYPES = 'person place organization animal plant object substance time number abstract event'.split() +HEADER = ''' + +Udapi CorefUD viewer + +''' +# I 
use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# so that the width of #overview can be changed by dragging the bottom right corner. +# The following lines would make the whole right border draggable: +# +# +# +#
    CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} .sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} @@ -23,10 +41,16 @@ $("span").click(function(e) { let was_selected = $(this).hasClass("selected"); $("span").removeClass("selected"); - if (!was_selected){$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); +window.onhashchange = function() { + $("span").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + $("span").hover( function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} @@ -60,10 +84,18 @@ def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): self.show_eid = show_eid self.colors = colors + def _representative_word(self, entity): + # return the first PROPN or NOUN. Or the most frequent one? + heads = [m.head for m in entity.mentions] + lemma_or_form = lambda n: n.lemma if n.lemma else n.form + for upos in ('PROPN', 'NOUN'): + nodes = [n for n in heads if n.upos == upos] + if nodes: + return lemma_or_form(nodes[0]) + return lemma_or_form(heads[0]) + def process_document(self, doc): - print('') - print('Udapi CorefUD viewer') - print('') + print(HEADER) if self.show_trees: print('') print('') - print('\n') + print('\n\n
    ') mention_ids = {} entity_colors = {} @@ -86,8 +118,21 @@ def process_document(self, doc): for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' + print('
    ') + print('' + '' + '\n') + for entity in doc.coref_entities: + print(f'' + f'' + f'') + print('
    eid#mword
    {entity.eid}{len(entity.mentions)}{self._representative_word(entity)}
    ') + print('
    ') + + print('
    ') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) + print('
    ') print('') - print('') + print('
    ') def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention @@ -113,7 +158,10 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{e.eid}" ' + print(f'', end='') if self.show_eid: print(f'{subspan.subspan_eid}', end='') From bbd702aa35fcf4e13d2a4ab2d3972a7efd89fcc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 16:22:03 +0100 Subject: [PATCH 369/871] Python glob.glob does not support {dir1,dir2} anyway --- udapi/core/files.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..c6973dad 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) From a5acaf43b1edb3468dfc493da6e7ae87f2d99966 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 17:58:45 +0100 Subject: [PATCH 370/871] ud.ComplyWithText: use node.misc['CorrectForm'] instead of node.misc['OrigForm'] which was a misleading name because the previous form value is usually not the real original form. 
--- udapi/block/ud/complywithtext.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..bacc56a2 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -34,7 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,24 +54,33 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + previous_form_attr - when changing node.form, we store the previous value + in node.misc[previous_form_attr] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. 
""" super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + node.misc[self.previous_form_attr] = node.form def process_tree(self, root): text = root.text @@ -203,7 +212,7 @@ def solve_diff(self, nodes, form): if ' ' in form: if len(nodes) == 1 and node.form == form.replace(' ', ''): if self.allow_space(form): - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form elif self.allow_goeswith: forms = form.split() @@ -235,7 +244,7 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form From a69c7a158edb91d12d2907f6802c3104d946ee0d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 18:00:46 +0100 Subject: [PATCH 371/871] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U so even if there are diffs which cannot be resolved, and thus we cannot fill SpaceAfter=No in the rest of the sentence, we must execute the "if self.fix_text:..." code, which changes the root.text (instead of changing the annotation of nodes). 
--- udapi/block/ud/complywithtext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index bacc56a2..1a13a4ec 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -121,7 +121,7 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: From fde163c32837ccc02a9b89d535be9769d4414340 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Wed, 8 Feb 2023 14:23:05 +0100 Subject: [PATCH 372/871] further adjusted Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 122 ++++++++++++++++++----------- 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 323f60f7..111bceb9 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -37,11 +37,11 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Dim'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Compound'] = ['Yes'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -76,14 +76,12 @@ def process_node(self, node): 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} if self.flavio: - # Flavio does not use Degree=Pos, hence Degree is not required. - # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -93,15 +91,16 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] @@ -122,6 +121,19 @@ def process_node(self, node): rf = [f for f in rf if f != 'Case'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + elif node.lemma in ['quicumque', 'qui', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] @@ -140,7 +152,9 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Cmp', 'Abs', 'Sup'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) @@ -152,8 +166,24 @@ def process_node(self, node): if node.feats['Person[psor]'] != '3': rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] - else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 
'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + af['PronType'].append('Rel') + elif node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] @@ -170,8 +200,8 @@ def process_node(self, node): rf = ['NumType', 'NumForm'] af = { 'NumType': ['Card'], - 'NumForm': ['Word', 'Roman', 'Digit'] - } + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -186,40 +216,40 @@ def process_node(self, node): elif re.match(r'^(VERB|AUX)$', node.upos): rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] } - if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') - af['Tense'] = ['Pres', 'Fut'] - if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB': rf.append('Voice') af['Voice'] = ['Act', 'Pass'] - # Main verbs have aspect but auxiliaries don't. 
- # TODO: apparently, apparently AUXs have aspect as well - # if node.upos == 'VERB': - # rf.append('Aspect') - # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) - af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] elif node.feats['VerbForm'] == 'Part': rf.extend(['Gender', 'Number', 'Case']) - af['Number'] = ['Sing', 'Plur'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] af['Degree'] = ['Abs', 'Cmp'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] - af['Tense'].append('Past') - # else: nothing to be added for VerbForm=Inf + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') if self.flavio: - # Flavio has killed Tense in his treebanks. - rf = [f for f in rf if f != 'Tense'] - af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] if 'Degree' in af: @@ -228,23 +258,22 @@ def process_node(self, node): af['Degree'] = ['Dim'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] - if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] - af['VerbForm'].append('Vnoun') + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': af = { - 'AdvType': ['Loc', 'Tim'], + 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['NumType'] = ['Card', 'Ord'] # e.g., primum af['VerbForm'] = ['Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) @@ -262,7 +291,8 @@ def process_node(self, node): elif re.match(r'^[CS]CONJ$', node.upos): af = { 'PronType': ['Rel', 'Con'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'Compound': ['Yes']} if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] @@ -271,10 +301,14 @@ def process_node(self, node): self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } if self.flavio: - af = { - 'VerbForm': ['Part'], - 'Proper': ['Yes']} + af['VerbForm'] = ['Part'], + af['Proper'] = ['Yes'] self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: From 29fb09caccd678560845ea3d80b2027145231c90 Mon Sep 17 00:00:00 2001 From: 
Martin Popel Date: Wed, 8 Feb 2023 18:04:56 +0100 Subject: [PATCH 373/871] improve ud.ComplyWithText for KorKor --- udapi/block/ud/complywithtext.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 1a13a4ec..02904731 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,6 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -54,6 +55,14 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). previous_form_attr - when changing node.form, we store the previous value in node.misc[previous_form_attr] (so no information is lost). 
Default="CorrectForm" because we expect that the previous value @@ -62,6 +71,7 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ the original spelling with typos as found in the raw text. CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. """ super().__init__(**kwargs) self.fix_text = fix_text @@ -70,17 +80,20 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.max_mwt_length = max_mwt_length self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``"): + if node.form not in ("''", "``") and self.previous_form_attr: node.misc[self.previous_form_attr] = node.form + if self.previous_form_attr == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text @@ -190,18 +203,38 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc['Added'] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -210,20 +243,25 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. 
if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1 and node_form == form.replace(' ', ''): if self.allow_space(form): self.store_previous_form(node) node.form = form elif self.allow_goeswith: + self.store_previous_form(node) forms = form.split() node.form = forms[0] + node.feats['Typo'] = 'Yes' for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + new = node.create_child(form=split_form, deprel='goeswith', upos='X') new.shift_after_node(node) else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -244,8 +282,13 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_previous_form(node) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc['Added'] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): From d5a1a2a756ef13629984eb40af7b5853dbd8c7a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:06:45 +0100 Subject: [PATCH 374/871] udapy hints when using a wrong block name or parameter name thanks to @michnov for this idea --- udapi/core/block.py | 23 +++++++++++++++++++---- udapi/core/run.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index f039abce..fdcad9fa 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,5 +1,6 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect def not_overridden(method): method.is_not_overridden = True @@ -14,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -73,7 +88,7 @@ def process_document(self, document): p_tree = not hasattr(self.process_tree, 'is_not_overridden') p_node = not hasattr(self.process_node, 'is_not_overridden') if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): - raise Exception("No processing activity defined in block " + str(self)) + raise Exception("No processing activity defined in block " + self.block_name()) if p_entity or p_mention: for entity in document.coref_entities: @@ -85,8 +100,8 @@ def process_document(self, document): if p_bundle or p_tree or p_node: for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') if p_bundle: self.process_bundle(bundle) else: diff --git a/udapi/core/run.py b/udapi/core/run.py index a0cc4a9a..418baca6 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,26 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bname = [c for c in dir(module) if c.lower() == sname][0] + blocks.append(f"{pname}.{bname}") + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. 
@@ -92,8 +112,17 @@ def _import_blocks(block_names, block_args): command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + blocks = _blocks_in_a_package(package_name) + if not blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available block in {package_name} are:\n" + + "\n".join(_blocks_in_a_package(package_name))) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. From 49ed44d2e309523cdf3361c599934d5dbf58a2a8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:23:36 +0100 Subject: [PATCH 375/871] read.XY files='!*.conllu' should iterated over sorted files glob.glob() returns files in an arbitrary order (as `ls -U`) --- udapi/core/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index c6973dad..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -65,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': From 1a4241104709e7647cf75ff84dbc68df3428fbe0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Feb 2023 23:49:11 +0100 Subject: [PATCH 376/871] improve ud.ComplyWithText (for KorKor) --- udapi/block/ud/complywithtext.py | 70 ++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 25 deletions(-) diff --git 
a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 02904731..c850018e 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,8 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_attr='CorrectForm', **kwargs): + previous_form_label='CorrectForm', previous_text_label='CorrectText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -63,8 +64,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") node2(form="in", deprel="goeswith", upos="X", parent=node1) node3(form="law", deprel="goeswith", upos="X", parent=node1). - previous_form_attr - when changing node.form, we store the previous value - in node.misc[previous_form_attr] (so no information is lost). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). Default="CorrectForm" because we expect that the previous value (i.e. the value of node.form before applying this block) contained the corrected spelling, while root.text contains @@ -72,6 +73,12 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
+ previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. + Default="CorrectText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". """ super().__init__(**kwargs) self.fix_text = fix_text @@ -81,7 +88,9 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct self.allow_hyphen_goeswith = allow_hyphen_goeswith - self.previous_form_attr = previous_form_attr + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): @@ -90,9 +99,9 @@ def allow_space(form): def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``") and self.previous_form_attr: - node.misc[self.previous_form_attr] = node.form - if self.previous_form_attr == 'CorrectForm': + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': node.feats['Typo'] = 'Yes' def process_tree(self, root): @@ -140,7 +149,8 @@ def process_tree(self, root): if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -152,6 +162,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if 
text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -208,12 +222,11 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): elif edit == 'insert': forms = text[text_lo:text_hi].split(' ') if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: - #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') next_node = char_nodes[tree_lo] for f in reversed(forms): new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') new.shift_before_node(next_node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: logging.warning('Unable to insert nodes\n%s', _diff2str(diff, tree_chars, text)) @@ -246,18 +259,26 @@ def solve_diff(self, nodes, form): node_form = node.form if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: node_form = node_form.replace('-', '') - if len(nodes) == 1 and node_form == form.replace(' ', ''): - if self.allow_space(form): - self.store_previous_form(node) - node.form = form - elif self.allow_goeswith: - self.store_previous_form(node) - forms = form.split() - node.form = forms[0] - node.feats['Typo'] = 'Yes' - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos='X') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in 
reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: @@ -283,9 +304,10 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): - new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: self.store_previous_form(node) node.form = form @@ -313,6 +335,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) From 3abb76df036f7aa2e8f39437aa7d5b80032ae850 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:08:12 +0100 Subject: [PATCH 377/871] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U even if the raw texts include double spaces or no-break spaces (TODO: alternatively, we could annotate these using SpacesAfter). 
--- udapi/block/ud/complywithtext.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index c850018e..351ebc01 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -109,9 +109,13 @@ def process_tree(self, root): if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return From 0c6f946802345cc670ece9663fc7007ff05efd73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:09:36 +0100 Subject: [PATCH 378/871] corefud.PrintMentions should show Entity annotations in MISC by default --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 12db433a..d011f686 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -12,7 +12,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, - minimize_cross=True, color=True, attributes='form,upos,deprel', + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', print_undef_as='_', print_doc_meta=True, print_comments=False, mark='(Mark)', hints=True, layout='classic', **kwargs): From f9dd071481e49944fe6c70629bf9d56a90bd86d6 Mon Sep 17 00:00:00 2001 From: Martin 
Popel Date: Fri, 10 Feb 2023 14:27:46 +0100 Subject: [PATCH 379/871] keep newdoc and global.Entity when using read.Conllu sent_id_filter=regex The global.Entity comment will be read automatically by read.Conllu and then inserted automatically by write.Conllu, but only for trees with tree.newdoc, so we need to keep this annotation as well (move it to the new first tree in a given document). --- udapi/core/basereader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a841bf1b..71d57159 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -97,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): From b036d572af97a9f06482ccdcd7e90cfe4f0f5655 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 15:15:48 +0100 Subject: [PATCH 380/871] update ord of empty nodes when deleting preceding nonempty nodes TODO: add tests, solve also deleting of empty nodes --- udapi/core/node.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 618e75eb..8a764498 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -516,6 +516,7 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). 
""" self._parent._children.remove(self) + empty_follows = None if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -523,6 +524,16 @@ def remove(self, children=None): self._parent._children.extend(self._children) self._parent._children.sort() self._children.clear() + elif self._root.empty_nodes: + will_be_removed = self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.empty: + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) @@ -536,14 +547,29 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + for empty in self._root.empty_nodes: + if empty > self: + empty.ord = round(empty.ord - 1, 1) else: # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". 
+ if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty.ord + (empty.ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" From 6c289d3bda8134a683f6362198888ee920520203 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 16:32:51 +0100 Subject: [PATCH 381/871] ud.ComplyWithText: the previous root.text value is better described as OrigText Unlike the previous node.form values, it is (usually) the original raw text including typos etc, so the label "CorrectText" was completely misleading. --- udapi/block/ud/complywithtext.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 351ebc01..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,7 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_label='CorrectForm', previous_text_label='CorrectText', + previous_form_label='CorrectForm', previous_text_label='OrigText', added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -74,8 +74,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
previous_text_label - when we are not able to adapt the annotation to match root.text - and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. - Default="CorrectText". When setting this parameter to an empty string, + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, no values will be stored to root.comment. added_label - when creating new nodes because allow_add_punct=True, we mark these nodes as new_node.misc[added_label] = 1. Default="Added". From 043f4d73745a0155db76d5f4776d77f7ceeeba8a Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Fri, 17 Feb 2023 16:47:25 +0100 Subject: [PATCH 382/871] minor changes in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 111bceb9..fde3b0bd 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -27,8 +27,11 @@ def __init__(self, flavio=False, **kwargs): def process_node(self, node): rf = [] af = {} + # PROIEL-specific: greek words without features + if node.lemma == 'greek.expression': + pass # NOUNS ################################################################ - if node.upos == 'NOUN': + elif node.upos == 'NOUN': if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { @@ -125,14 +128,14 @@ def process_node(self, node): af['PronType'] = [] if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: 
af['PronType'].append('Rcp') rf.remove('Case') - elif node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['quicumque', 'qui', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. @@ -176,7 +179,7 @@ def process_node(self, node): af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') - if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') elif node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') From e84741a6e78acaaf13739945bd17814d569e3601 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:06:56 +0100 Subject: [PATCH 383/871] Remove NOCOREF entities e.g. from AnCora. --- udapi/block/corefud/removenocorefentities.py | 21 ++++++++++++++++++++ udapi/core/coref.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 udapi/block/corefud/removenocorefentities.py diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..8baba086 --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). 
We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. + """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 12dda239..4cd656f1 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -300,7 +300,7 @@ def __init__(self, eid, etype=None): self.split_ante = [] def __lt__(self, another): - """Does this CorefEntity precedes (word-order wise) `another` entity? + """Does this CorefEntity precede (word-order wise) `another` entity? This method defines a total ordering of all entities by the first mention of each entity (see `CorefMention.__lt__`). From 16c3a48ed3eb7861757092649a6ece22b893151c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:27:19 +0100 Subject: [PATCH 384/871] Another method of removing entities. 
--- udapi/block/corefud/removenocorefentities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py index 8baba086..4551873c 100644 --- a/udapi/block/corefud/removenocorefentities.py +++ b/udapi/block/corefud/removenocorefentities.py @@ -18,4 +18,4 @@ def process_document(self, doc): entities = doc.coref_entities if not entities: return - doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} From 8b442889aca3c1b881d7d53896d1eb0547635cfa Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 15:52:18 +0100 Subject: [PATCH 385/871] CorefUD: counting sentence sequences with no coref annotation --- udapi/block/corefud/countgaps.py | 67 ++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 udapi/block/corefud/countgaps.py diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..c8ee8d76 --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,67 @@ +from udapi.core.block import Block +from collections import Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = Counter() + + def _report_stats(self, counter=None, header_id=None): + if not counter: + counter = self._total_counter + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in empty_seqs: 
+ counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counter = Counter() + empty_seqs = [] + curr_seq = [] + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + newdoc = tree.newdoc + empty_seqs = [] + curr_seq = [] + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + elif curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + + if self.report_per_file: + self._report_stats(file_counter, header_id="FULL DOC") + + self._total_counter.update(file_counter) + + def process_end(self): + if self.report_total: + self._report_stats(header_id="TOTAL") From 716461fe3b67711f71a8cee028668fe34ceffef0 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 19:22:33 +0100 Subject: [PATCH 386/871] besides sequences, counting also paragraphs with no coref mentions --- udapi/block/corefud/countgaps.py | 63 +++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py index c8ee8d76..fc45540a 100644 --- a/udapi/block/corefud/countgaps.py +++ b/udapi/block/corefud/countgaps.py @@ -1,5 +1,5 @@ from udapi.core.block import Block -from collections import Counter +from collections import defaultdict, Counter class CountGaps(Block): """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" @@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, 
report_per_file=True, report_total=T self.report_per_newdoc = report_per_newdoc self.report_per_file = report_per_file self.report_total = report_total - self._total_counter = Counter() + self._total_counter = defaultdict(Counter) - def _report_stats(self, counter=None, header_id=None): - if not counter: - counter = self._total_counter + def _report_stats(self, counter, header_id=None): if header_id: print(f"============ {header_id} ============") for key in sorted(counter): print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") def _count_empty_seqs(self, empty_seqs): counter = Counter() @@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs): return counter def process_document(self, doc): - file_counter = Counter() + file_counters = defaultdict(Counter) empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True newdoc = None for i, tree in enumerate(doc.trees): if tree.newdoc: if i: if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") newdoc = tree.newdoc empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True has_mention = any(node.coref_mentions for node in tree.descendants) if not has_mention: curr_seq.append(tree.sent_id) - elif curr_seq: - 
empty_seqs.append(curr_seq) - curr_seq = [] + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") if self.report_per_file: - self._report_stats(file_counter, header_id="FULL DOC") + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") - self._total_counter.update(file_counter) + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) def process_end(self): if self.report_total: - self._report_stats(header_id="TOTAL") + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") From c147469f5a4a9267902974846c6ff2d804447cdb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 00:25:12 +0100 Subject: [PATCH 387/871] write.CorefHtml add visualization menu show: eid, trees, line breaks, paragraphs --- udapi/block/write/corefhtml.py | 39 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 280fc213..20f68291 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,7 +11,7 @@ Udapi CorefUD 
viewer ''' -# I use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} # so that the width of #overview can be changed by dragging the bottom right corner. # The following lines would make the whole right border draggable: # @@ -25,9 +25,19 @@ display: grid; border-right: double; padding: 5px; width: 20em; background: #ddd; border-radius: 5px; } +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence .tree span {border: none; padding: 0; display:inline;} .sentence span .eid {display:block; font-size: 10px;} -.showtree {float:left; margin: 5px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} .sentence .singleton {border-style: dotted;} @@ -55,16 +65,22 @@ function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} ); + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + ''' SCRIPT_SHOWTREE = ''' $(".sentence").each(function(index){ var sent_id = this.id; - $(this).before( + $(this).prepend( $("
    ') print('
    ') + print('\n' + '\n') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) print('
    ') @@ -180,7 +203,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if tree.newdoc: print(f'

    {tree.newdoc if tree.newdoc is not True else ""}


    ') elif tree.newpar: - print('
    ') + print('
    ') opened = [] print(f'

    ') for node in nodes_and_empty: @@ -188,7 +211,7 @@ def process_tree(self, tree, mention_ids, entity_colors): subspan = subspans.pop() self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) - + is_head = self._is_head(node) if is_head: print('', end='') @@ -199,7 +222,7 @@ def process_tree(self, tree, mention_ids, entity_colors): print('', end='') if is_head: print('', end='') - + while opened and opened[-1].words[-1] == node: print('', end='') opened.pop() @@ -229,7 +252,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if not node.no_space_after: print(' ', end='') - + print('

    ') def _is_head(self, node): From 0b30f5b75ab2a53ed5e0425d536094dee5c56f02 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 02:53:43 +0100 Subject: [PATCH 388/871] more visualization options --- udapi/block/write/corefhtml.py | 65 +++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 20f68291..fd500e7d 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,13 +11,7 @@ Udapi CorefUD viewer ''' -# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} -# so that the width of #overview can be changed by dragging the bottom right corner. -# The following lines would make the whole right border draggable: -# -# -# -#
    + CSS = ''' #wrap {display: flex; align-items: flex-start;} #main {width: 100%; padding: 5px; background: white; z-index:100;} @@ -27,15 +21,19 @@ } #main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} #menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} #menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} .change .b1 {transform: translate(0, 9px) rotate(-45deg);} .change .b2 {opacity: 0;} .change .b3 {transform: translate(0, -9px) rotate(45deg);} -.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} -.sentence .tree span {border: none; padding: 0; display:inline;} -.sentence span .eid {display:block; font-size: 10px;} +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} .showtree {margin: 5px; user-select: none;} .display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} @@ -48,22 +46,22 @@ ''' SCRIPT_BASE = ''' -$("span").click(function(e) { +$(".m").click(function(e) { let was_selected = $(this).hasClass("selected"); - $("span").removeClass("selected"); + $(".m").removeClass("selected"); if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); window.onhashchange = function() { - $("span").removeClass("selected"); + $(".m").removeClass("selected"); var fragment = window.location.hash.substring(1); if (fragment) {$("." 
+ fragment).addClass("selected");} } -$("span").hover( - function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, - function(e) {$("span").removeClass("active");} +$(".m").hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} ); function menuclick(x) { @@ -94,10 +92,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): + def __init__(self, show_trees=True, show_eid=False, show_etype=False, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees self.show_eid = show_eid + self.show_etype = show_etype self.colors = colors def _representative_word(self, entity): @@ -120,6 +119,10 @@ def process_document(self, doc): if self.colors: for i in range(self.colors): print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}') + if not self.show_eid: + print('.eid {display: none;}') + if not self.show_etype: + print('.etype {display: none;}') print('') print('\n\n
    ') @@ -146,13 +149,19 @@ def process_document(self, doc): print('
    ') print('
    ') - print('\n' '\n') - for tree in doc.trees: - self.process_tree(tree, mention_ids, entity_colors) - print('
    ') - print('') print('
    ') - def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): + def _start_subspan(self, subspan, crossing=False): m = subspan.mention e = m.entity - classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"} m' + classes = f'{e.eid} {self._mention_ids[m]} {e.etype or "other"} m' title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' if self.colors: - classes += f' {entity_colors[e]}' + classes += f' {self._entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -252,7 +303,7 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): f'{subspan.subspan_eid}' f' {e.etype}', end='') - def process_tree(self, tree, mention_ids, entity_colors): + def process_tree(self, tree): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -273,7 +324,7 @@ def process_tree(self, tree, mention_ids, entity_colors): for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids, entity_colors) + self._start_subspan(subspan) opened.append(subspan) is_head = self._is_head(node) @@ -311,7 +362,7 @@ def process_tree(self, tree, mention_ids, entity_colors): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, entity_colors, True) + self._start_subspan(broken, True) opened.append(subspan) if not node.no_space_after: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 48431900..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,7 +79,9 @@ def process_document(self, doc): print('\n') print('
    ') def print_doc_json(self, doc): - print('data=[') + print('[') for (bundle_number, bundle) in enumerate(doc, 1): if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -116,7 +122,7 @@ def print_doc_json(self, doc): print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') + print(']') @staticmethod From 327bb6f9083f6131b4f986dac9b56f2570957f60 Mon Sep 17 00:00:00 2001 From: Federica Gamba Date: Thu, 30 Mar 2023 12:22:27 +0200 Subject: [PATCH 393/871] adjustments in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 74 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index fde3b0bd..dce4592d 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -28,7 +28,8 @@ def process_node(self, node): rf = [] af = {} # PROIEL-specific: greek words without features - if node.lemma == 'greek.expression': + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: pass # NOUNS ################################################################ elif node.upos == 'NOUN': @@ -41,12 +42,14 @@ def process_node(self, node): 'Degree': ['Dim'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'VerbForm': ['Part']} + 'VerbForm': ['Part', 'Vnoun']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) @@ -61,10 +64,10 @@ def process_node(self, node): 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - af['Compound'] = 'Yes' + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] - if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### @@ -72,7 +75,7 @@ def process_node(self, node): if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: rf = ['Gender', 'Number', 'Case'] af = { - 'NumType': ['Ord', 'Dist'], + 'NumType': ['Dist', 'Mult', 'Ord'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], @@ -83,9 +86,10 @@ def process_node(self, node): 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) @@ -112,10 +116,10 @@ def process_node(self, node): rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - # 1st and 2nd person do not have gender + # 3rd person must have gender if node.feats['Person'] == '3': # is, id rf.append('Gender') - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -126,20 +130,20 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] # lexical check of PronTypes af['PronType'] = [] - if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - if node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['qui', 'quicumque', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, 
so it is not required. - af['InflClass'] = ['LatAnom', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] af['Compound'] = ['Yes'] af['Polarity'] = ['Neg'] af['Form'] = ['Emp'] @@ -175,25 +179,26 @@ def process_node(self, node): if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: if not af['PronType'] == ['Prs']: af['PronType'].append('Prs') - elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') - elif node.lemma in ['qui', 'quantus', 'quot']: + if node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') - elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: af['PronType'].append('Dem') - elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: 
af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] af['Compound'] = ['Yes'] af['Form'] = ['Emp'] af['NumType'] = ['Card'] af['Degree'].append('Dim') + af['PronType'].append('Art') if re.match(r'^(unus|ambo)', node.lemma): af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) @@ -202,7 +207,7 @@ def process_node(self, node): elif node.upos == 'NUM': rf = ['NumType', 'NumForm'] af = { - 'NumType': ['Card'], + 'NumType': ['Card', 'Ord'], 'NumForm': ['Word', 'Roman', 'Digit'], 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. @@ -212,7 +217,9 @@ def process_node(self, node): af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ @@ -227,7 +234,7 @@ def process_node(self, node): if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] - if node.upos == 'VERB': + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): rf.append('Voice') af['Voice'] = ['Act', 'Pass'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive @@ -255,6 +262,7 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] if 'Degree' in af: af['Degree'].append('Dim') else: @@ -262,7 +270,12 @@ def process_node(self, node): af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -271,13 +284,13 @@ def process_node(self, node): 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], - 'NumType': ['Card', 'Ord'], # e.g., primum + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['VerbForm'] = ['Part'] + af['VerbForm'] = ['Fin', 'Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ @@ -289,6 +302,7 @@ def process_node(self, node): if self.flavio: af['Form'] = ['Emp'] af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) # CONJUNCTIONS ######################################################### elif re.match(r'^[CS]CONJ$', node.upos): @@ -301,6 +315,8 @@ def process_node(self, node): af['Form'] = ['Emp'] af['VerbForm'] = ['Fin'] af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': @@ -310,9 +326,13 @@ def process_node(self, node): 
'Abbr': ['Yes'] } if self.flavio: - af['VerbForm'] = ['Part'], + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From 1ddfce4aec593e222a0e3d26e8f74acf561d1356 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 31 Mar 2023 19:42:35 +0200 Subject: [PATCH 394/871] gzip the docs/* json and html files --- udapi/block/write/corefhtml.py | 49 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index cd0db1e5..6129b335 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -17,6 +17,7 @@ from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention from collections import Counter import udapi.block.write.html +import gzip import sys import os @@ -26,6 +27,7 @@ Udapi CorefUD viewer + ''' CSS = ''' @@ -87,21 +89,26 @@ $("#main-menu").toggle(); } -function load_doc(doc_num) { +async function load_doc(doc_num) { loading_now = true; - console.log("loading doc" + doc_num + ".html"); - $.get(docs_dir + "/doc" + doc_num + ".html", function(data){ - $("#main").append(data); - add_mention_listeners($("#doc" + doc_num + " .m")); - $("#doc" + doc_num + " .sentence").each(add_show_tree_button); - loading_now = false; - }).fail(function(){ + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ if (! load_fail_reported) { load_fail_reported = true; - alert("Cannot load " + docs_dir + "/doc" + doc_num - + ".html\\nLocal files do not support lazy loading. 
Run a web server 'python -m http.server'"); + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." + + " Run a web server 'python -m http.server'\\n" + + "error = " + error); } - }); + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + loading_now = false; } var docs_loaded = 1; @@ -126,7 +133,7 @@ add_show_tree_button = function(index, el){ var sent_id = el.id; $(el).prepend( - $("