diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..988f321d --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,60 @@ +# Use the latest 2.1 version of CircleCI pipeline process engine. +# See: https://circleci.com/docs/2.0/configuration-reference +version: 2.1 + +# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. +# See: https://circleci.com/docs/2.0/orb-intro/ +orbs: + # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files + # Orb commands and jobs help you with common scripting around a language/tool + # so you don't have to copy and paste it everywhere. + # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python + python: circleci/python@1.5.0 + +# Define a job to be invoked later in a workflow. +# See: https://circleci.com/docs/2.0/configuration-reference/#jobs +jobs: + build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + # These next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/ + # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub + # A list of available CircleCI Docker convenience images is available here: https://circleci.com/developer/images/image/cimg/python + # The executor is the environment in which the steps below will be executed - below we use a Python 3.9 container + # Change the version below to your required version of python + docker: + - image: cimg/python:3.9 + # Checkout the code as the first step. This is a dedicated CircleCI step. + # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. + # Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt. + # Then run your tests! + # CircleCI will report the results back to your VCS provider. + steps: + - checkout + - python/install-packages: + pkg-manager: pip + # app-dir: ~/project/package-directory/ # If your requirements.txt isn't in the root directory. + # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. + - run: + name: Install Udapi + command: pip install ".[test]" + - run: mkdir -p test-results + - run: + name: Run pytest tests + # This assumes pytest is installed via the install-packages step above + command: pytest --junitxml=test-results/junit.xml -o junit_family=legacy + - store_test_results: + path: test-results + - run: + name: Color TextModeTrees + command: udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 + - run: + name: External tests + command: cd udapi/core/tests && ./external_tests.sh + + +# Invoke jobs via workflows +# See: https://circleci.com/docs/2.0/configuration-reference/#workflows +workflows: + sample: # This is the name of the workflow, feel free to change it to better match your workflow. + # Inside the workflow, you define the jobs you want to run.
+ jobs: + - build-and-test diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..8804cc4e --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,23 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Currently, RTD needs to select an OS with OpenSSL>=1.1.1 because of +# urllib3's dependence on that system library. (alternately, pin urllib3<2 +# See https://github.com/urllib3/urllib3/issues/2168 +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: false + +python: + install: + - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 417e39fb..00000000 --- a/.travis.yml +++ /dev/null @@ -1,34 +0,0 @@ -language: python -python: - - "3.6" - - "3.7" - - "3.8" - - "3.9" -#before_install: -# - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test -# - sudo apt-get update -qq -# - sudo apt-get install -qq gcc-4.8 g++-4.8 -# - CC=g++-4.8 pip install ufal.udpipe -#install: -# - python setup.py install -install: - - pip3 install ".[test]" -script: - - python -m pytest - - udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 - - cd udapi/core/tests && ./external_tests.sh -jobs: - include: - - name: "Python 3.9 on Windows" - os: windows - language: shell - before_install: - - choco install python - - python --version - - python -m pip install --upgrade pip - - pip3 install --upgrade pytest - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - script: - - python -c 'import colorama;print("\033[31m some red text")' - - python -Xutf8 -c 'import udapi;udapi.Document("udapi/core/tests/data/babinsky.conllu").draw(color=1)' - - python -m pytest diff --git a/CHANGES.txt b/CHANGES.txt index 77d72548..49dfd40e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,16 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. 
+0.4.0 2024-03-28 + - support for CorefUD 1.3 + - edits by Dan Zeman in block.ud.* + - requires Python 3.9+ (difficult to test older versions in Circle-CI) + +0.3.0 2022-04-06 + - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) + - edits by Dan Zeman in block.ud.* + - Circle-CI (instead of Travis-CI) + 0.2.3 2021-02-23 - support for enhanced dependencies and coreference - requires Python 3.6+ due to f-strings diff --git a/README.md b/README.md index 3bf52eec..0b41297f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # udapi-python Python framework for processing Universal Dependencies data -[![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=shield)](https://circleci.com/gh/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) diff --git a/bin/udapy b/bin/udapy index 528e3577..30cb2595 100755 --- a/bin/udapy +++ b/bin/udapy @@ -71,6 +71,20 @@ else: logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', level=level) +# Global flag to track if an unhandled exception occurred +_unhandled_exception_occurred = False + +def _custom_excepthook(exc_type, exc_value, traceback): + global _unhandled_exception_occurred + _unhandled_exception_occurred = True + + # Call the default excepthook to allow normal error reporting + sys.__excepthook__(exc_type, exc_value, traceback) + +# Override the default excepthook +sys.excepthook = _custom_excepthook + + # Process and provide the scenario. if __name__ == "__main__": @@ -86,7 +100,13 @@ if __name__ == "__main__": # Udapi documents have a many cyclic references, so running GC is quite slow. if not args.gc: gc.disable() - atexit.register(os._exit, 0) + # When an exception/error has happened, udapy should exit with a non-zero exit code, + # so that users can use `udapy ... || echo "Error detected"` (or Makefile reports errors). + # However, we cannot use `atexit.register(lambda: os._exit(1 if sys.exc_info()[0] else 0))` + # because Python has already left the exception-handling block + # (the exception/error has already been reported and sys.exc_info()[0] is None). + # We thus keep our own record in _unhandled_exception_occurred. + atexit.register(lambda: os._exit(1 if _unhandled_exception_occurred else 0)) atexit.register(sys.stderr.flush) if args.save: args.scenario = args.scenario + ['write.Conllu'] diff --git a/bin/udapy.bat b/bin/udapy.bat new file mode 100644 index 00000000..013e08e7 --- /dev/null +++ b/bin/udapy.bat @@ -0,0 +1,4 @@ +@REM The Python launcher "py" must be accessible via the PATH environment variable. +@REM We assume that this batch script lies next to udapy in udapi-python/bin. +@REM The PYTHONPATH environment variable must contain the path to udapi-python. +py %~dp$PATH:0\udapy %* diff --git a/docs/conf.py b/docs/conf.py index 45966b57..b7d0f6e5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ # General information about the project. project = 'Udapi' -copyright = '2017, Martin Popel' +copyright = '2023, Martin Popel' author = 'Martin Popel' # The version info for the project you're documenting, acts as replacement for @@ -61,14 +61,14 @@ # The short X.Y version. version = '0' # The full version, including alpha/beta/rc tags.
-release = '2' +release = '3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -167,7 +167,7 @@ def run_apidoc(_): module = os.path.abspath(os.path.join(cur_dir, "..", "udapi")) print(module) - from sphinx.apidoc import main + from sphinx.ext.apidoc import main main(['--separate', '-o', cur_dir, module, '--force']) def setup(app): diff --git a/docs/requirements.txt b/docs/requirements.txt index a994db47..a537f220 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,4 @@ -colorama +colorama>=0.4.6 termcolor +ufal.udpipe +sphinx_rtd_theme diff --git a/requirements.txt b/requirements.txt index 647361f7..044d3af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -colorama +colorama>=0.4.6 termcolor ufal.udpipe diff --git a/setup.cfg b/setup.cfg index 4e96f81a..3ac1ebf2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = udapi -version = 0.2.3 +version = 0.4.0 author = Martin Popel author_email = popel@ufal.mff.cuni.cz description = Python framework for processing Universal Dependencies data @@ -14,7 +14,7 @@ classifiers = [options] packages = find: -python_requires = >=3.6 +python_requires = >=3.9 include_package_data = True scripts = bin/udapy diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..e079f8a6 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/tutorial/01-visualizing.ipynb b/tutorial/01-visualizing.ipynb index 382bb11f..70bea240 100644 --- a/tutorial/01-visualizing.ipynb +++ b/tutorial/01-visualizing.ipynb @@ -526,7 +526,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the next tutorial, [02-blocks.ipynb](02-blocks.ipynb), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." + "In the next tutorial, 02-blocks.ipynb (not finished yet), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." 
] } ], diff --git a/tutorial/README.md b/tutorial/README.md index 05e96d59..425f7df5 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -6,4 +6,4 @@ Don't display the tutorial `ipynb` files on GitHub because it cannot render the If you don't have Jupyter installed, you can display the tutorial with https://nbviewer.jupyter.org, using the following links: - [01-visualizing.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb) -- [02-blocks.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-blocks.ipynb) +- 02-blocks.ipynb (not finished yet) diff --git a/tutorial/udapi-tutorial-dz.odt b/tutorial/udapi-tutorial-dz.odt new file mode 100644 index 00000000..a954e9f3 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.odt differ diff --git a/tutorial/udapi-tutorial-dz.pdf b/tutorial/udapi-tutorial-dz.pdf new file mode 100644 index 00000000..91312187 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.pdf differ diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py index aeb945a8..74483368 100644 --- a/udapi/block/corefud/concatmentionmisc.py +++ b/udapi/block/corefud/concatmentionmisc.py @@ -14,11 +14,11 @@ def process_tree(self,root): index = matchObj.group(2) finalattr = 'MentionMisc'+index - value = node.misc[attrname] - + value = node.misc[attrname].replace(",", "%2C") + if finalattr not in node.misc: node.misc[finalattr] = f'{innerattrib}:{value}' else: - node.misc[finalattr] += f' {innerattrib}:{value}' + node.misc[finalattr] += f',{innerattrib}:{value}' del node.misc[attrname] diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..fc45540a --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,94 @@ +from udapi.core.block import Block +from collections import defaultdict, Counter + +class CountGaps(Block): + """Block corefud.CountGaps searches for sequences of sentences with no coreference annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = defaultdict(Counter) + + def _report_stats(self, counter, header_id=None): + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in empty_seqs: + counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counters = defaultdict(Counter) + empty_seqs = [] + empty_pars = [] + curr_seq = [] + curr_par = [] + is_empty_par = True + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") + newdoc = tree.newdoc + empty_seqs = [] + empty_pars = [] + curr_seq =
[] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") + + if self.report_per_file: + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") + + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) + + def process_end(self): + if self.report_total: + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py new file mode 100644 index 00000000..4e68e8dd --- /dev/null +++ b/udapi/block/corefud/delete.py @@ -0,0 +1,93 @@ +"""Delete coreference annotation (Entity|Bridge|SplitAnte) and optionally also empty nodes.""" + +from udapi.core.block import Block +import udapi.core.coref +import logging + +class Delete(Block): + + def __init__(self, coref=True, empty=False, misc=False, **kwargs): + """Args: + coref: delete coreference attributes in MISC, i.e (Entity|Bridge|SplitAnte) + empty: delete all empty nodes and references to them (from DEPS and MISC[Functor]) + misc: delete all attributes in MISC except for SpaceAfter + """ + super().__init__(**kwargs) + self.coref = coref + self.empty = empty + self.misc = misc + + def is_root_reachable_by_deps(self, node, parents_to_ignore=None): + """ Check if the root node is reachable from node, possibly after deleting the parents_to_ignore nodes. + """ + stack = [(node, [])] + while stack: + proc_node, path = stack.pop() + # root is reachable + if proc_node == node.root: + break + # path forms a cycle, the root cannot be reached through this branch + if proc_node in path: + continue + for dep in proc_node.deps: + # the root cannot be reached through ignored nodes + if dep['parent'] in parents_to_ignore: + continue + # process the parent recursively + stack.append((dep['parent'], path + [proc_node])) + else: + return False + return True + + def _deps_ignore_nodes(self, node, parents_to_ignore): + """ Retrieve deps from the node, recursively ignoring specified parents. 
+ """ + newdeps = [] + stack = [(node, [])] + while stack: + proc_node, skipped_nodes = stack.pop() + # if there is a cycle of skipped nodes, ground the subtree to the root + if proc_node in skipped_nodes: + newdeps.append({'parent': node.root, 'deprel': 'root'}) + continue + for dep in proc_node.deps: + # keep deps with a parent that shouldn't be ignored + if not dep['parent'] in parents_to_ignore: + newdeps.append(dep) + continue + # process the ignored parent recursively + stack.append((dep['parent'], skipped_nodes + [proc_node])) + return newdeps + + def process_document(self, doc): + # This block should work both with coreference loaded (deserialized) and not. + if self.coref: + doc._eid_to_entity = None + for root in doc.trees: + if self.empty: + for node in root.descendants: + # process only the nodes dependent on empty nodes + if not '.' in node.raw_deps: + continue + # just remove empty parents if the root remains reachable + if self.is_root_reachable_by_deps(node, root.empty_nodes): + node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] + # otherwise propagate to non-empty ancestors + else: + newdeps = self._deps_ignore_nodes(node, root.empty_nodes) + newdeps_sorted = sorted(set((dep['parent'].ord, dep['deprel']) for dep in newdeps)) + node.raw_deps = '|'.join(f"{p}:{r}" for p, r in newdeps_sorted) + + if '.' in node.misc['Functor'].split(':')[0]: + del node.misc['Functor'] + root.empty_nodes = [] + + if self.coref or self.misc: + for node in root.descendants + root.empty_nodes: + if self.misc: + node.misc = 'SpaceAfter=No' if node.no_space_after else None + if self.coref: + node._mentions = [] + if not self.misc: + for attr in ('Entity', 'Bridge', 'SplitAnte'): + del node.misc[attr] diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py new file mode 100644 index 00000000..1575cea6 --- /dev/null +++ b/udapi/block/corefud/fixcorefud02.py @@ -0,0 +1,56 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +NEW_ETYPE = { + "misc": "other", + "date": "time", + "loc": "place", + "location": "place", + "per": "person", + "org": "organization", + "_": "", + } + +class FixCorefUD02(Block): + """Fix errors in CorefUD 0.2 for release of CorefUD 1.0.""" + + def process_document(self, doc): + # For GUM + if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity': + doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity' + + for entity in doc.coref_entities: + if entity.etype: + # Harmonize etype. + # If gen/spec is distinguished, store it in all mentions' other['gstype']. + etype = entity.etype.lower() + if etype.startswith('spec') or etype.startswith('gen'): + gstype = 'gen' if etype.startswith('gen') else 'spec' + for m in entity.mentions: + m.other['gstype'] = gstype + if etype == 'spec': + etype = 'other' + etype = etype.replace('gen', '').replace('spec', '').replace('.', '') + etype = NEW_ETYPE.get(etype, etype) + + # etype="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. + # Apposition is a mention-based rather than entity-based attribute. + # We don't know which of the mentions it should be assigned, but let's expect all non-first. + # UD marks appositions with deprel appos, so once someone checks it is really redunant, + # TODO we can delete the appos mention attribute. 
+ if etype == 'appos': + etype = '' + for mention in entity.mentions[1:]: + mention.other['appos'] = '1' + entity.etype = etype + + for mention in entity.mentions: + # Harmonize bridge relation labels + for bridge in mention.bridging: + rel = bridge.relation.lower() + if rel.endswith('-inv'): + rel = 'i' + rel.replace('-inv', '') + rel = rel.replace('-', '') + rel = rel.replace('indirect_', '') + bridge.relation = rel diff --git a/udapi/block/corefud/fixentityacrossnewdoc.py b/udapi/block/corefud/fixentityacrossnewdoc.py new file mode 100644 index 00000000..61e5e4f6 --- /dev/null +++ b/udapi/block/corefud/fixentityacrossnewdoc.py @@ -0,0 +1,25 @@ +from udapi.core.block import Block +import udapi.core.coref +import logging + +class FixEntityAcrossNewdoc(Block): + """ + Fix the error reported by validate.py --coref: + "[L6 Coref entity-across-newdoc] Same entity id should not occur in multiple documents" + by making the entity IDs (eid) unique in each newdoc document. + + This block uses Udapi's support for loading GUM-like GRP document-wide IDs + (so the implementation is simple, although unnecessarily slow). + After applying this block, IDs of all entities are prefixed with document numbers, + e.g. "e45" in the 12th document changes to "d12.e45". + If you prefer simple eid, use corefud.IndexClusters afterwards. + """ + + def process_document(self, doc): + if not doc.eid_to_entity: + logging.warning(f"No entities in document {doc.meta}") + udapi.core.coref.store_coref_to_misc(doc) + assert doc.meta["global.Entity"].startswith("eid") + doc.meta["global.Entity"] = "GRP" + doc.meta["global.Entity"][3:] + udapi.core.coref.load_coref_from_misc(doc) + doc.meta["global.Entity"] = "eid" + doc.meta["global.Entity"][3:] diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py new file mode 100644 index 00000000..b4a42a43 --- /dev/null +++ b/udapi/block/corefud/fixinterleaved.py @@ -0,0 +1,84 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class FixInterleaved(Block): + """Fix mentions with interleaved or crossing spans. + https://github.com/ufal/corefUD/issues/25 + """ + + def __init__(self, same_entity_only=True, both_discontinuous=False, + crossing_only=False, nested_same_subspan=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.crossing_only = crossing_only + self.nested_same_subspan = nested_same_subspan + + def process_tree(self, tree): + mentions, deleted = set(), set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if mA in deleted or mB in deleted: + continue + if self.same_entity_only and mA.entity != mB.entity: + continue + + # Fully nested spans are OK, except for same-subspan. + sA, sB = set(mA.words), set(mB.words) + if (sA <= sB) or (sB <= sA): + if not self.nested_same_subspan: + continue + elif not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + + # Crossing or interleaved+crossing? 
+ elif self.crossing_only: + if not sA.intersection(sB): + continue + else: + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + + mA.words = list(sA.union(sB)) + for wb in sB: + try: + wb._mentions.remove(mB) + except ValueError: + pass + try: + mB.entity.mentions.remove(mB) + except ValueError: + pass + deleted.add(mB) + + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. + sA = set(mA.words) + for mC in sorted(mentions): + if mC in deleted or mC is mA or mC is mB: + continue + if sA != set(mC.words): + continue + # So mA and mC have the same span and we need to delete one of them to fix it. + # We will delete mA because it has the artificially enlarged span, + # while mC is from the original annotation. + for wa in sA: + try: + wa._mentions.remove(mA) + except ValueError: + pass + try: + mA.entity.mentions.remove(mA) + except ValueError: + pass + break + deleted.add(mA) diff --git a/udapi/block/corefud/fixparentheses.py b/udapi/block/corefud/fixparentheses.py new file mode 100644 index 00000000..bc8e6504 --- /dev/null +++ b/udapi/block/corefud/fixparentheses.py @@ -0,0 +1,31 @@ +from udapi.core.block import Block + + +class FixParentheses(Block): + """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). + If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" + + def __init__(self, mark=True, **kwargs): + super().__init__(**kwargs) + self.mark = mark + + def process_coref_mention(self, mention): + words = [word.lemma for word in mention.words] + pairs = ['()', '[]', '{}'] + for pair in pairs: + if pair[0] in words: + if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ + mention.words[-1].next_node.lemma == pair[1]: + next_node = mention.words[-1].next_node + mention.words.append(next_node) + if self.mark: + next_node.misc['Mark'] = 1 + + elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ + and mention.words[0].prev_node.lemma == pair[0]: + prev_node = mention.words[0].prev_node + mention.words.append(prev_node) + if self.mark: + prev_node.misc['Mark'] = 1 diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py new file mode 100644 index 00000000..48a3608d --- /dev/null +++ b/udapi/block/corefud/fixtovalidate.py @@ -0,0 +1,39 @@ +from udapi.core.block import Block + +class FixToValidate(Block): + """This block fixes the CorefUD data so that the final documents are valid conllu files.""" + + def _set_root_deprel(self, doc): + for root in doc.trees: + for node in root.children: + if node.deprel != "root": + node.deprel = "root" + + def _unset_root_deprel(self, doc): + for node in doc.nodes: + parent = node.parent + if node.deprel == "root" and parent is not None and not parent.is_root(): + #print("\t".join(['Non-0-root:', node.address(), node.upos, str(node.feats), node.parent.upos, str(node.parent.feats)])) + if parent.upos == "PUNCT" and parent.parent is not None: + node.parent = parent.parent + if node.upos == "CCONJ": + node.deprel = "cc" 
+ elif node.upos == "ADJ" and parent.upos == "PROPN": + node.deprel = "amod" + elif node.upos == "NOUN" and parent.upos == "VERB": + node.deprel = "obl" + else: + node.deprel = "parataxis" + + def _space_before_pardoc(self, doc): + last_node = None + for i, tree in enumerate(doc.trees): + if i > 0: + if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: + del last_node.misc["SpaceAfter"] + last_node = tree.descendants[-1] + + def process_document(self, doc): + self._set_root_deprel(doc) + self._unset_root_deprel(doc) + self._space_before_pardoc(doc) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py new file mode 100644 index 00000000..d6093ece --- /dev/null +++ b/udapi/block/corefud/guessspan.py @@ -0,0 +1,33 @@ +from udapi.core.block import Block + +class GuessSpan(Block): + """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" + + def process_coref_mention(self, mention): + mwords = mention.head.descendants(add_self=True) + # TODO add heuristics from corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. + to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py index 95be6ce0..bf6d798d 100644 --- a/udapi/block/corefud/gum2corefud.py +++ b/udapi/block/corefud/gum2corefud.py @@ -8,92 +8,104 @@ class Gum2CorefUD(Block): def process_tree(self, tree): docname = tree.bundle.document.meta['docname'] + '_' - def entity2cluster_id(name): - return docname + name.strip('()').replace(',','').replace('+','') - - clusters = tree.bundle.document.coref_clusters + eid_to_entity = tree.bundle.document._eid_to_entity unfinished_mentions = defaultdict(list) for node in tree.descendants: - entity = node.misc['Entity'] - if not entity: + misc_entity = node.misc['Entity'] + if not misc_entity: continue - parts = [x for x in re.split('(\([^())]+\)?|[^())]+\))', entity) if x] - for part in parts: - # GUM entity name could be e.g. - # abstract-173 or place-1-Coron,_Palawan or place-77-Sub-Saharan_Africa. - # Note that the wikification part of the name may contain commas and dashes. - # Let's take the whole name as cluster_id, which will be normalized later on. - # We just need to remove commas and plus signs which are forbidden in cluster_id - # because they are used as separators in Bridging and SplitAnte, respectively. - # Let's store the type in cluster.cluster_type and Wikification in mention.misc. 
- name = entity2cluster_id(part) - if part[0] == '(': - cluster = clusters.get(name) - if cluster is None: - chunks = part.strip('()').split('-', maxsplit=2) - if len(chunks) == 3: - ctype, _, wiki = chunks - elif len(chunks) == 2: - ctype, _, wiki = chunks[0], None, None - else: - raise ValueError(f"Unexpected entity {part} at {node}") - cluster = node.create_coref_cluster(cluster_id=name, cluster_type=ctype) - mention = cluster.mentions[0] + # Attribute Entity may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # entities = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + entities = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for entity in entities: + # GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity + # but the closing tag is shortent just to GRP. + opening, closing = (entity[0] == '(', entity[-1] == ')') + entity = entity.strip('()') + if not opening and not closing: + logging.warning(f"Entity {entity} at {node} has no opening nor closing bracket.") + elif not opening and closing: + name = docname + entity + if not unfinished_mentions[name]: + raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.") + else: + mention = unfinished_mentions[name].pop() + mention.span = f'{mention.head.ord}-{node.ord}' + else: + attrs = entity.split('-') + if len(attrs) == 6: + etype, grp, infstat, minspan, ctype, wiki = attrs + elif len(attrs) == 5: + wiki = None + etype, grp, infstat, minspan, ctype = attrs + elif len(attrs) > 6: + logging.warning(f"Entity {entity} at {node} has more than 6 attributes.") + etype, grp, infstat, minspan, ctype, wiki = entity.split('-', maxsplit=5) + else: + raise ValueError(f"Less than 5 attributes in {entity} at {node}") + name = docname + grp + entity = eid_to_entity.get(name) + if entity is None: + entity = node.create_coref_entity(eid=name, etype=etype) + mention = entity.mentions[0] + mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" if wiki: - mention.misc = 'Wikification:' + wiki.replace(',', '%2C') + mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') else: - mention = cluster.create_mention(head=node) - if part[-1] == ')': + mention = entity.create_mention(head=node) + if closing: mention.words = [node] else: unfinished_mentions[name].append(mention) - elif part[-1] == ')': - if not unfinished_mentions[name]: - logging.warning(f"Mention {name} closed at {node}, but not opened in the same tree.") - else: - mention = unfinished_mentions[name].pop() - mention.span = f'{mention.head.ord}-{node.ord}' del node.misc['Entity'] - misc_bridge = node.misc['Bridge'] - if misc_bridge: - # E.g. Entity=event-23|Bridge=time-23" form, + """Re-index the coreference entity IDs (eid). The final entity IDs are of the "e" form, where are ordinal numbers starting from the one specified by the `start` parameter. This block can be applied on multiple documents within one udapy call. 
- For example, to re-index ClusterId in all conllu files in the current directory + For example, to re-index eid in all conllu files in the current directory (keeping the IDs unique across all the files), use: `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1` Parameters: ----------- start : int - the starting index (by default 1) + the starting index (default=1) + prefix : str + prefix of the IDs before the number (default="e") """ - def __init__(self, start=1): + def __init__(self, start=1, prefix='e'): self.start = start + self.prefix = prefix def process_document(self, doc): - clusters = doc.coref_clusters - if not clusters: + entities = doc.coref_entities + if not entities: return - new_clusters = {} - for idx, cid in enumerate(clusters, self.start): - cluster = clusters[cid] - new_cid = "c" + str(idx) - # need to change private variable - cluster._cluster_id = new_cid - new_clusters[new_cid] = cluster + new_eid_to_entity = {} + for idx, entity in enumerate(entities, self.start): + new_eid = self.prefix + str(idx) + entity.eid = new_eid + new_eid_to_entity[new_eid] = entity self.start = idx + 1 - doc._coref_clusters = new_clusters + doc._eid_to_entity = new_eid_to_entity diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py new file mode 100644 index 00000000..08296531 --- /dev/null +++ b/udapi/block/corefud/link2cluster.py @@ -0,0 +1,137 @@ +import logging +from udapi.core.block import Block + +class Link2Cluster(Block): + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format. + + Params: + id_attr: name of the attribute in MISC that stores the original-format IDs of nodes + ante_attr: name of the attribute in MISC that stores the ID of the antecedent + of the current node (in the same format as `id_attr`). + delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion? + (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr, + bridge_attr, bridge_relation_attr if these are used). Default=True. + infstat_attr: name of the attribute in MISC that stores the information status of a given mention + Will be stored in `mention.other['infstat']`. Use None for ignoring this. + coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention + Will be stored in `mention.other['coreftype']`. Use None for ignoring this. + bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent + of the current node/mention (in the same format as `id_attr`). + Default=None, i.e. ignore this parameter. + bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type + (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter. + eid_counter: use a global counter of entity.eid and start with a given number. Default=1. + The main goal of this parameter is to make eid unique across multiple documents. + If you use eid_counter=0, this feature will be turned off, + so entities will be created using `root.document.create_coref_entity()`, + with no eid parameter, so that the eid will start from "e1" in each document processed by this block. 
+ """ + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, + infstat_attr='information-status', coreftype_attr='coreftype', + bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs): + super().__init__(**kwargs) + self.id_attr = id_attr + self.ante_attr = ante_attr + self.delete_orig_attrs = delete_orig_attrs + self.infstat_attr = infstat_attr + self.coreftype_attr = coreftype_attr + self.bridge_attr = bridge_attr + self.bridge_relation_attr = bridge_relation_attr + self.eid_counter = int(eid_counter) + + def _new_entity(self, doc): + if not self.eid_counter: + return doc.create_coref_entity() + entity = doc.create_coref_entity(eid=f"e{self.eid_counter}") + self.eid_counter += 1 + return entity + + def _new_mention(self, entity, node): + mention = entity.create_mention(head=node, words=[node]) + if self.infstat_attr and node.misc[self.infstat_attr]: + mention.other['infstat'] = node.misc[self.infstat_attr] + if self.delete_orig_attrs: + del node.misc[self.infstat_attr] + if self.coreftype_attr and node.misc[self.coreftype_attr]: + mention.other['coreftype'] = node.misc[self.coreftype_attr] + if self.delete_orig_attrs: + del node.misc[self.coreftype_attr] + return mention + + def process_document(self, doc): + id2node = {} + links = [] + bridges = [] + for node in doc.nodes_and_empty: + this_id = node.misc[self.id_attr] + if this_id != '': + id2node[this_id] = node + ante_id = node.misc[self.ante_attr] + if ante_id != '': + if ante_id == this_id: + logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}") + else: + links.append([ante_id, this_id]) + if self.delete_orig_attrs: + for attr in (self.id_attr, self.ante_attr): + del node.misc[attr] + if self.bridge_attr: + bridge_id = node.misc[self.bridge_attr] + if bridge_id != '': + if bridge_id == this_id: + logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}") + else: + bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]]) + if self.delete_orig_attrs: + for attr in (self.bridge_attr, self.bridge_relation_attr): + del node.misc[attr] + + # It seems faster&simpler to process the links in any order and implement entity merging, + # rather than trying to sort the links so that no entity merging is needed. + for ante_id, this_id in links: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if not this_node.coref_mentions and not ante_node.coref_mentions: + # None of the nodes is part of any mention/entity. Let's create them. + entity = self._new_entity(this_node.root.document) + self._new_mention(entity, ante_node) + self._new_mention(entity, this_node) + elif this_node.coref_mentions and ante_node.coref_mentions: + # Both of the nodes are part of mentions in different entities. + # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity). + # While the official API supports "stealing" a single mention (m.entity = another_entity), + # the implementation below using _mentions and _entity is a bit faster. + e_ante, e_this = this_node.coref_entities[0], ante_node.coref_entities[0] + assert e_ante != e_this + for mention in e_ante.mentions: + mention._entity = e_this + e_this._mentions.extend(e_ante.mentions) + e_this._mentions.sort() + e_ante._mentions.clear() + else: + # Only one of the nodes is part of an entity. 
Let's add the second one to this entity. + if ante_node.coref_mentions: + self._new_mention(ante_node.coref_entities[0], this_node) + else: + self._new_mention(this_node.coref_entities[0], ante_node) + + # Bridging + for ante_id, this_id, relation in bridges: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if ante_node.coref_mentions: + m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node) + e_ante = m_ante.entity + else: + e_ante = self._new_entity(ante_node.root.document) + m_ante = self._new_mention(e_ante, ante_node) + if this_node.coref_mentions: + m_this = next(m for m in this_node.coref_mentions if m.head is this_node) + else: + e_this = self._new_entity(this_node.root.document) + m_this = self._new_mention(e_this, this_node) + m_this.bridging.append((e_ante, relation)) diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py index 3b2534bc..92773dc2 100644 --- a/udapi/block/corefud/load.py +++ b/udapi/block/corefud/load.py @@ -8,5 +8,5 @@ def __init__(self, strict=True): self.strict = strict def process_document(self, doc): - if doc._coref_clusters is None: + if doc._eid_to_entity is None: udapi.core.coref.load_coref_from_misc(doc, self.strict) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py new file mode 100644 index 00000000..8064e67f --- /dev/null +++ b/udapi/block/corefud/markcrossing.py @@ -0,0 +1,39 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MarkCrossing(Block): + """Find mentions with crossing spans.""" + + def __init__(self, same_entity_only=False, continuous_only=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.continuous_only = continuous_only + self.print_form = print_form + self.log = log + self.mark = mark + self._logged = {} + + def _print(self, mention): + if self.print_form: + return ' '.join([w.form for w in mention.words]) + else: + return mention.span + + def process_node(self, node): + if len(node.coref_mentions) > 1: + for mA, mB in itertools.combinations(node.coref_mentions, 2): + if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.continuous_only and (',' in mA.span or ',' in mB.span): + continue + if self.mark: + node.misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + cross_id = node.root.sent_id + mA.span + mB.span + if cross_id not in self._logged: + self._logged[cross_id] = True + print(f"crossing mentions at {node}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py new file mode 100644 index 00000000..c00f73b1 --- /dev/null +++ b/udapi/block/corefud/markinterleaved.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkInterleaved(Block): + """Find mentions with interleaved spans.""" + + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if 
self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if set(mA.words).intersection(set(mB.words)): + continue + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"interleaved mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py new file mode 100644 index 00000000..8db8a657 --- /dev/null +++ b/udapi/block/corefud/marknested.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkNested(Block): + """Find nested mentions.""" + + def __init__(self, same_entity_only=True, both_discontinuous=False, multiword_only=False, + print_form=False, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.multiword_only = multiword_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if not (sA <= sB) and not (sB <= sA): + continue + if self.multiword_only and (len(sA) == 1 or len(sB) == 1): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"nested mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py new file mode 100644 index 00000000..cc63b387 --- /dev/null +++ b/udapi/block/corefud/markpairs.py @@ -0,0 +1,138 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +from collections import Counter +import logging + +class MarkPairs(Block): + """Find pairs of coreference mentions within the same sentence with given properties. + Mark these pairs of mentions (using `misc["Mark"]`), so they can be further + processed or printed. 
+ + Usage: + # Find pairs of mentions of the same entity within the same sentence: + cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R + + Properties: + same_entity - both mentions belong to the same entity (cluster) + both_continuous - both mentions have continuous spans + both_discontinuous - both mentions have discontinuous spans + nested - span of one mention is nested (a subset of) in the span of the other mention + crossing - spans are crossing (i.e. intersecting, but neither is subset of the other) + interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other) + same_head - the same node is a head of both mentions + same_span - both mentions have the same span (which is invalid according to UD's validate.py) + same_subspan - at least one of the mentions is discontinuous and one of its subspans + is also a subspan (or span) of the other mention + + + You can combine any number of properties. + Each property can have one of the three values: + include - this is the default value: include pairs with this property, i.e. ignore the property + exclude - exclude (from the marking) pairs of mentions with this property + only - pairs of mentions without this property will be excluded + + As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g. + nested=only same_head=exclude + can be written as + nested=1 same_head=-1 + """ + + def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0, + nested=0, crossing=0, interleaved=0, + same_head=0, same_span=0, same_subspan=0, + print_form=False, print_total=True, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + + + self.same_entity = self._convert(same_entity) + self.both_continuous = self._convert(both_continuous) + self.both_discontinuous = self._convert(both_discontinuous) + self.nested = self._convert(nested) + self.crossing = self._convert(crossing) + self.interleaved = self._convert(interleaved) + self.same_head = self._convert(same_head) + self.same_span = self._convert(same_span) + self.same_subspan = self._convert(same_subspan) + + self.print_form = print_form + self.print_total = print_total + self.log = log + self.mark = mark + self.counter = Counter() + + def _convert(self, value): + if value in {-1, 0, 1}: + return value + if value == 'include': + return 0 + if value == 'only': + return 1 + if value == 'exclude': + return -1 + raise ValueError('unknown value ' + value) + + def _ok(self, condition, value): + if value == 0: + return True + return (condition and value == 1) or (not condition and value==-1) + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + self.counter['mentions'] += len(mentions) + + for mA, mB in itertools.combinations(mentions, 2): + self.counter['pairs'] += 1 + if not self._ok(mA.entity == mB.entity, self.same_entity): + continue + if not self._ok(mA.head == mB.head, self.same_head): + continue + + if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan: + sA, sB = mA.span, mB.span + cA, cB = ',' not in sA, ',' not in sB + if not self._ok(cA and cB, self.both_continuous): + continue + if not self._ok(not cA and not cB, self.both_discontinuous): + continue + if not self._ok(sA == sB, self.same_span): + continue 
+ if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan): + continue + + if self.nested or self.crossing or self.interleaved: + wA, wB = set(mA.words), set(mB.words) + if not self._ok(wA <= wB or wB <= wA, self.nested): + continue + if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing): + continue + if self.interleaved: + a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0] + b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0] + if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved): + continue + + self.counter['matching'] += 1 + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") + + def after_process_document(self, doc): + if self.print_total: + #if self.max_trees and seen_trees > self.max_trees: + # print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}' + logging.info(msg) + doc.meta["corefud.MarkPairs"] = msg diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py new file mode 100644 index 00000000..f3cfd7b3 --- /dev/null +++ b/udapi/block/corefud/marksamesubspan.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkSameSubSpan(Block): + """Find mentions with the same subspan.""" + + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, nested_only=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.nested_only = nested_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if self.nested_only and not (sA <= sB) and not (sB <= sA): + continue + if not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"same-subspan mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py new file mode 100644 index 00000000..61b613cb --- /dev/null +++ b/udapi/block/corefud/mergesamespan.py @@ -0,0 +1,52 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MergeSameSpan(Block): + """ + Multiple same-span mentions are considered invalid in CoNLL-U, whether they + belong to the same entity or not. 
If they occur, merge them into one. + Note: We currently do not have mentions across sentence boundaries in the + CorefUD data, so this block processes one sentence at a time. + """ + + def __init__(self, same_entity_only=False, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + # Reduce non-determinism in which mention is removed: + # If the mentions belong to different entities, sort them by entity (entity) ids. + if mA.entity.eid > mB.entity.eid: + mA, mB = mB, mA + + sA, sB = set(mA.words), set(mB.words) + if sA != sB: + continue + + # If the mentions belong to different entities, we should merge the + # entities first, i.e., pick one entity as the survivor, move the + # mentions from the other entity to this entity, and remove the + # other entity. + if mA.entity != mB.entity: + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.entity.eid} vs. {mB.entity.eid}") + ###!!! TODO: As of now, changing the entity of a mention is not supported in the API. + #for m in mB.entity.mentions: + # m.entity = mA.entity + # Remove mention B. It may have been removed earlier because of + # another duplicate, that is the purpose of try-except. + ###!!! TODO: If we remove a singleton, we are destroying the entity. Then we must also handle possible bridging and split antecedents pointing to that entity! + mB.words = [] + try: + mB.entity.mentions.remove(mB) + except ValueError: + pass diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py index e9034a22..00a32e9f 100644 --- a/udapi/block/corefud/movehead.py +++ b/udapi/block/corefud/movehead.py @@ -6,9 +6,10 @@ class MoveHead(Block): """Block corefud.MoveHead moves the head to the highest node in each mention.""" - def __init__(self, bugs='warn', **kwargs): + def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs): self.counter = Counter() self.bugs = bugs + self.keep_head_if_possible = keep_head_if_possible super().__init__(**kwargs) def _eparents(self, node): @@ -68,26 +69,24 @@ def find_head(self, mention): mention.head.misc['Bug'] = 'highest-head' # Fifth, try to convervatively preserve the original head, if it is one of the possible heads. - if mention.head in enh_heads: + if self.keep_head_if_possible and mention.head in enh_heads: return mention.head, 'nontreelet' # Finally, return the word-order-wise first head candidate as the head. 
return enh_heads[0], 'nontreelet' - def process_document(self, doc): - for cluster in doc.coref_clusters.values(): - for mention in cluster.mentions: - self.counter['total'] += 1 - if len(mention.words) < 2: - self.counter['single-word'] += 1 - else: - new_head, category = self.find_head(mention) - self.counter[category] += 1 - if new_head is mention.head: - self.counter[category + '-kept'] += 1 - else: - self.counter[category + '-moved'] += 1 - mention.head = new_head + def process_coref_mention(self, mention): + self.counter['total'] += 1 + if len(mention.words) < 2: + self.counter['single-word'] += 1 + else: + new_head, category = self.find_head(mention) + self.counter[category] += 1 + if new_head is mention.head: + self.counter[category + '-kept'] += 1 + else: + self.counter[category + '-moved'] += 1 + mention.head = new_head def process_end(self): logging.info("corefud.MoveHead overview of mentions:") diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printentities.py similarity index 54% rename from udapi/block/corefud/printclusters.py rename to udapi/block/corefud/printentities.py index a9a03f5e..7230c6a5 100644 --- a/udapi/block/corefud/printclusters.py +++ b/udapi/block/corefud/printentities.py @@ -3,36 +3,39 @@ from udapi.core.block import Block from collections import Counter, defaultdict -class PrintClusters(Block): - """Block corefud.PrintClusters prints all mentions of a given cluster.""" +class PrintEntities(Block): + """Block corefud.PrintEntities prints all mentions of a given entity.""" - def __init__(self, id_re=None, min_mentions=0, print_ranges=True, aggregate_mentions=True, **kwargs): + def __init__(self, eid_re=None, min_mentions=0, print_ranges=True, mark_head=True, + aggregate_mentions=True, **kwargs): """Params: - id_re: regular expression constraining ClusterId of the clusters to be printed - min_mentions: print only clusters with with at least N mentions + eid_re: regular expression constraining ID of the entities to be printed + min_mentions: print only entities with with at least N mentions print_ranges: print also addressess of all mentions (compactly, using the longest common prefix of sent_id) + mark_head: mark the head (e.g. 
as "red **car**") """ super().__init__(**kwargs) - self.id_re = re.compile(str(id_re)) if id_re else None + self.eid_re = re.compile(str(eid_re)) if eid_re else None self.min_mentions = min_mentions self.print_ranges = print_ranges + self.mark_head = mark_head self.aggregate_mentions = aggregate_mentions def process_document(self, doc): if 'docname' in doc.meta: - print(f"Coref clusters in document {doc.meta['docname']}:") - for cluster in doc.coref_clusters.values(): - if self.id_re and not self.id_re.match(cluster.cluster_id): + print(f"Coref entities in document {doc.meta['docname']}:") + for entity in doc.coref_entities: + if self.eid_re and not self.eid_re.match(entity.eid): continue - if len(cluster.mentions) < self.min_mentions: + if len(entity.mentions) < self.min_mentions: continue - print(f" {cluster.cluster_id} has {len(cluster.mentions)} mentions:") + print(f" {entity.eid} has {len(entity.mentions)} mentions:") if self.aggregate_mentions: counter = Counter() ranges = defaultdict(list) - for mention in cluster.mentions: - forms = ' '.join([w.form for w in mention.words]) + for mention in entity.mentions: + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) counter[forms] += 1 if self.print_ranges: ranges[forms].append(mention.head.root.address() + ':' +mention.span) @@ -45,7 +48,8 @@ def process_document(self, doc): prefix = os.path.commonprefix(ranges[form]) print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') else: - for mention in cluster.mentions: - print(' ' + ' '.join([w.form for w in mention.words])) + for mention in entity.mentions: + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) + print(' ' + forms) if self.print_ranges: print(f" {mention.head.root.address()}:{mention.span}") diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py new file mode 100644 index 00000000..d011f686 --- /dev/null +++ b/udapi/block/corefud/printmentions.py @@ -0,0 +1,186 @@ +import random +from collections import Counter +from udapi.core.block import Block +from udapi.block.write.textmodetreeshtml import TextModeTreesHtml +from udapi.block.write.textmodetrees import TextModeTrees + +class PrintMentions(Block): + """Print mentions with various properties.""" + + def __init__(self, continuous='include', almost_continuous='include', treelet='include', + forest='include', almost_forest='include', oneword='include', singleton='include', + empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, + print_total=True, print_should=True, + print_sent_id=True, print_text=True, add_empty_line=True, indent=1, + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', + print_undef_as='_', print_doc_meta=True, print_comments=False, + mark='(Mark)', hints=True, layout='classic', + **kwargs): + super().__init__(**kwargs) + self.continuous = self._convert(continuous) + self.almost_continuous = self._convert(almost_continuous) + self.treelet = self._convert(treelet) + self.forest = self._convert(forest) + self.almost_forest = self._convert(almost_forest) + self.oneword = self._convert(oneword) + self.singleton = self._convert(singleton) + self.empty = self._convert(empty) + + self.max_trees = max_trees + self.html = html + self.shuffle = shuffle + if shuffle: + random.seed(42) + self.print_other_forms = print_other_forms + self.print_total = print_total, + self.print_should = print_should, + print_class 
= TextModeTreesHtml if html else TextModeTrees + self.print_block = print_class( + print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, + minimize_cross=minimize_cross, color=color, attributes=attributes, + print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments, + mark=mark, hints=hints, layout=layout) + + def _convert(self, value): + if value in {'include', 'exclude', 'only'}: + return value + if value == 1: + return 'only' + if value == 0: + return 'exclude' + raise ValueError('unknown value ' + value) + + def before_process_document(self, document): + self.print_block.before_process_document(document) + + def after_process_document(self, document): + self.print_block.after_process_document(document) + + def _ok(self, condition, value): + if value == 'include': + return True + return (condition and value == 'only') or (not condition and value=='exclude') + + def _is_auxiliary_etc(self, node): + if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': + return True + if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: + return True + return False + + def _is_forest(self, mention, mwords, almost): + for w in mention.words: + # UD unfortunatelly does not use the copula-as-head style for copula construction, + # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children. + # However, in the cop-as-head stule, only "my" would depend on "fault" (and should be part of the mention). + # It is difficult to tell apart which w.children are related to w and which to the copula. + # We thus ignore these cases completely (we expect any child is potentially related to the copula). + if any(ch.udeprel == 'cop' for ch in w.children): + continue + for ch in w.children: + if ch not in mwords: + if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid + return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). + # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). 
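# --- Editor's sketch (illustration only, not part of this commit) ---
# The include/only/exclude convention implemented by PrintMentions._ok() above:
# every mention property filter either ignores the property, keeps only mentions
# that have it, or keeps only mentions that lack it. ok() and the toy mention
# list are hypothetical names for this example.

def ok(condition, value):
    if value == 'include':
        return True
    return (condition and value == 'only') or (not condition and value == 'exclude')

mentions = [('one-word mention', True), ('multi-word mention', False)]
for setting in ('include', 'only', 'exclude'):
    kept = [name for name, is_oneword in mentions if ok(is_oneword, setting)]
    print(f"oneword={setting}: {kept}")
# oneword=include: ['one-word mention', 'multi-word mention']
# oneword=only: ['one-word mention']
# oneword=exclude: ['multi-word mention']
# --- end of editor's sketch ---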
+ if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid + return False + return True + + def _is_almost_continuous(self, mention): + if ',' not in mention.span: + return True + nonempty = [w for w in mention.words if not w.is_empty()] + if not nonempty: + return True + mwords = set(mention.words) + gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords] + for gap_node in gap_nodes: + if not gap_node.is_empty(): + return False + return True + + def process_document(self, doc): + mentions = [] + for entity in doc.coref_entities: + if self._ok(len(entity.mentions) == 1, self.singleton): + mentions.extend(entity.mentions) + if self.shuffle: + random.shuffle(mentions) + else: + mentions.sort() + + seen_trees = 0 + for mention in mentions: + if not self._ok(len(mention.words) == 1, self.oneword): + continue + if not self._ok(',' not in mention.span, self.continuous): + continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): + continue + + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue + + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + if not self._ok(heads <= 1, self.treelet): + continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue + + for w in mention.words: + w.misc['Mark'] = 1 + + seen_trees += 1 + if self.max_trees and seen_trees > self.max_trees: + if not self.print_total: + print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + return + else: + this_form = ' '.join([w.form for w in mention.words]) + print("# Mention = " + this_form) + if self.print_other_forms: + counter = Counter() + for m in mention.entity.mentions: + forms = ' '.join([w.form for w in m.words]) + if forms != this_form: + counter[forms] += 1 + if counter: + print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='') + for form, count in counter.most_common(self.print_other_forms): + print(f' "{form}"({count})', end='') + print() + self.print_block.process_tree(mention.head.root) + for w in mention.words: + del w.misc['Mark'] + + if self.print_total: + if self.max_trees and seen_trees > self.max_trees: + print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}') + diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..4551873c --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). 
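# --- Editor's sketch (illustration only, not part of this commit) ---
# The treelet test used in process_document() above (heads <= 1): a mention is a
# treelet iff at most one of its words has a parent outside the mention. The
# sketch uses a plain dict of basic dependencies; the enhanced-deps handling of
# empty nodes is omitted. is_treelet() and parent_of are hypothetical names.

def is_treelet(mention_ids, parent_of):
    mention = set(mention_ids)
    external_heads = sum(1 for w in mention if parent_of[w] not in mention)
    return external_heads <= 1

# Toy tree rooted at word 2: 2 -> {1, 4}, 4 -> 3 (0 is the artificial root).
parent_of = {1: 2, 2: 0, 3: 4, 4: 2}
print(is_treelet({1, 2, 3, 4}, parent_of))  # True  (only word 2 points outside)
print(is_treelet({3, 4}, parent_of))        # True  (only word 4 points outside)
print(is_treelet({1, 3}, parent_of))        # False (two external heads, not a treelet)
# --- end of editor's sketch ---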
We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. + """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} diff --git a/udapi/block/corefud/singleparent.py b/udapi/block/corefud/singleparent.py new file mode 100644 index 00000000..ee9b1948 --- /dev/null +++ b/udapi/block/corefud/singleparent.py @@ -0,0 +1,47 @@ +"""If an empty node has multiple (enhanced-deps) parents, only the highest one is kept.""" +from udapi.core.block import Block +from collections import Counter +from udapi.core.node import find_minimal_common_treelet +import logging + +class SingleParent(Block): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._reasons = Counter() + + def process_tree(self, tree): + for empty in tree.empty_nodes: + self._reasons['_empty'] += 1 + if len(empty.deps) > 1: + self._reasons['_more-parents'] += 1 + parents = [d['parent'] for d in empty.deps] + nonempty_parents = [p for p in parents if not p.is_empty()] + if len(nonempty_parents) != len(parents): + self._reasons['empty-parent'] += 1 + #empty.misc['Mark'] = f"empty-parent:{empty.deps}" + logging.warning(f"Empty node {empty} has an empty parent.") + if not nonempty_parents: + empty.deps = [] + self._reasons['no-nonempty-parent'] += 1 + continue + (highest, added_nodes) = find_minimal_common_treelet(*nonempty_parents) + if highest in nonempty_parents: + self._reasons['one-governs'] += 1 + empty.deps = [d for d in empty.deps if d['parent'] is highest] + continue + nonempty_parents.sort(key=lambda n:n._get_attr('depth')) + if len(nonempty_parents)>1 and nonempty_parents[0]._get_attr('depth') == nonempty_parents[0]._get_attr('depth'): + self._reasons['same-depth'] += 1 + #empty.misc['Mark'] = f"same-depth:{empty.deps}" + else: + self._reasons['one-highest'] += 1 + #empty.misc['Mark'] = f"one-highest:{empty.deps}" + empty.deps = [d for d in empty.deps if d['parent'] is nonempty_parents[0]] + + def after_process_document(self, document): + message = "\n" + for k, v in self._reasons.most_common(): + message += f"{k}={v}\n" + #document.meta["bugs"] = message + logging.info(message) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index f07c2a27..5368cabc 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -4,46 +4,62 @@ class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_clusters=True, - report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM', - exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs): + def __init__(self, m_len_max=5, e_len_max=5, + report_basics=False, report_mentions=True, report_entities=True, + report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', + exclude_singletons=False, exclude_nonsingletons=False, style='human', + per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max - self.c_len_max = c_len_max + self.e_len_max = e_len_max + self.report_basics = report_basics self.report_mentions = report_mentions - self.report_clusters = report_clusters + self.report_entities = report_entities 
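# --- Editor's sketch (illustration only, not part of this commit) ---
# The core decision in corefud.SingleParent above: among several candidate
# parents of an empty node, keep only the one closest to the root; if the two
# shallowest candidates are equally deep, there is no single highest parent
# (the 'same-depth' case). depth(), keep_highest() and parent_of are
# hypothetical names for this example.

def depth(node, parent_of):
    d = 0
    while parent_of[node] is not None:
        node = parent_of[node]
        d += 1
    return d

def keep_highest(parents, parent_of):
    by_depth = sorted(parents, key=lambda n: depth(n, parent_of))
    if len(by_depth) > 1 and depth(by_depth[0], parent_of) == depth(by_depth[1], parent_of):
        return None                           # 'same-depth': ambiguous, keep nothing
    return by_depth[0]

# Toy tree: root -> a -> b, root -> c
parent_of = {'root': None, 'a': 'root', 'b': 'a', 'c': 'root'}
print(keep_highest(['b', 'a'], parent_of))    # a    (depth 1 beats depth 2)
print(keep_highest(['a', 'c'], parent_of))    # None (both candidates at depth 1)
# --- end of editor's sketch ---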
self.report_details = report_details self.exclude_singletons = exclude_singletons self.exclude_nonsingletons = exclude_nonsingletons self.style = style - if style not in 'tex human'.split(): - raise ValueError(f'Unknown style f{style}') + if style not in 'tex tex-table tex-doc human'.split(): + raise ValueError(f'Unknown style {style}') + self.per_doc = per_doc + self.max_rows_per_page = max_rows_per_page + if docname not in 'newdoc filename'.split(): + raise ValueError(f'Unknown style {style}') + self.docname = docname + self.docname_len = docname_len + self._header_printed = False + self._lines_printed = None self.counter = Counter() self.mentions = 0 - self.clusters = 0 + self.entities = 0 + self.singletons = 0 self.total_nodes = 0 self.longest_mention = 0 - self.longest_cluster = 0 + self.longest_entity = 0 self.m_words = 0 self.selected_upos = None if selected_upos == 'all' else selected_upos.split() def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) - for cluster in doc.coref_clusters.values(): - len_mentions = len(cluster.mentions) + self.counter['documents'] += 1 + for entity in doc.coref_entities: + len_mentions = len(entity.mentions) + if len_mentions == 1: + self.singletons += 1 if len_mentions == 1 and self.exclude_singletons: continue elif len_mentions > 1 and self.exclude_nonsingletons: continue - self.longest_cluster = max(len_mentions, self.longest_cluster) + self.longest_entity = max(len_mentions, self.longest_entity) self.counter['c_total_len'] += len_mentions - self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 + self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1 - self.clusters += 1 + self.entities += 1 if not self.report_mentions and not self.report_details: continue - for mention in cluster.mentions: + for mention in entity.mentions: self.mentions += 1 all_words = len(mention.words) non_empty = len([w for w in mention.words if not w.is_empty()]) @@ -66,28 +82,67 @@ def process_document(self, doc): heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 self.counter['m_nontreelet'] += 1 if heads > 1 else 0 - def process_end(self): + if self.report_basics: + for tree in doc.trees: + self.counter['newdocs'] += 1 if tree.newdoc else 0 + self.counter['sents'] += 1 + self.counter['words'] += len(tree.descendants) + self.counter['empty'] += len(tree.empty_nodes) + + def after_process_document(self, doc): + if self.per_doc: + self.process_end(skip=False, doc=doc) + self.counter = Counter() + self.mentions = 0 + self.entities = 0 + self.singletons = 0 + self.total_nodes = 0 + self.longest_mention = 0 + self.longest_entity = 0 + self.m_words = 0 + + def process_end(self, skip=True, doc=None): + if not self._lines_printed: + self.print_header() + self._lines_printed = 0 + if self.per_doc: + if skip: + self.print_footer() + return + else: + docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') + elif self.style.startswith('tex-'): + print(f"{self.counter['documents']:4} documents &") + self._lines_printed += 1 + mentions_nonzero = 1 if self.mentions == 0 else self.mentions - clusters_nonzero = 1 if self.clusters == 0 else self.clusters + entities_nonzero = 1 if self.entities == 0 else self.entities total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes columns =[ ] - if self.report_clusters: - columns += [('clusters', f"{self.clusters:7,}"), - ('clusters_per1k', f"{1000 * 
self.clusters / total_nodes_nonzero:6.0f}"), - ('longest_cluster', f"{self.longest_cluster:6}"), - ('avg_cluster', f"{self.counter['c_total_len'] / self.clusters:5.1f}")] - for i in range(1, self.c_len_max + 1): - percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero - columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) + if self.report_basics: + columns += [('docs', f"{self.counter['newdocs']:7,}"), + ('sents', f"{self.counter['sents']:7,}"), + ('words', f"{self.counter['words']:7,}"), + ('empty', f"{self.counter['empty']:7,}"),] + if self.report_entities: + columns += [('entities', f"{self.entities:7,}"), + ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), + ('longest_entity', f"{self.longest_entity:6}"), + ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] + for i in range(1, self.e_len_max + 1): + percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero + columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}")) if self.report_mentions: columns += [('mentions', f"{self.mentions:7,}"), ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), ('longest_mention', f"{self.longest_mention:6}"), - ('avg_mention', f"{self.counter['m_total_len'] / self.mentions:5.1f}")] - for i in range(0, self.m_len_max + 1): - percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero - columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) + ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")] + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero + columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) if self.report_details: columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"), ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"), @@ -99,8 +154,103 @@ def process_end(self): for upos in upos_list: columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) - if self.style == 'tex': - print(" & ".join(c[1] for c in columns)) + if self.style.startswith('tex'): + print(" & ".join(c[1] for c in columns), end=" \\\\\n") elif self.style == 'human': for c in columns: print(f"{c[0]:>15} = {c[1].strip():>10}") + if not self.per_doc: + self.print_footer() + elif self._lines_printed > self.max_rows_per_page: + self.print_footer(False) + self._lines_printed = 0 + + def print_header(self): + if not self.style.startswith('tex-'): + return + if self.style == 'tex-doc': + if self._lines_printed is None: + print(r'\documentclass[multi=mypage]{standalone}') + print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}') + print(r'\title{Udapi coreference statistics}') + print(r'\begin{document}') + print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') + lines = [r'\begin{mypage}\begin{tabular}{@{}l ', + " " * self.docname_len, + ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), + " " * self.docname_len] + if self.report_basics: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{total number of} ' + lines[2] += r'& & & & ' + lines[3] += r'& docs & sents & words & empty n.' + if self.report_entities: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{entities} ' + lines[2] += r'& total & per 1k & \MC{2}{length} ' + lines[3] += r'& count & words & max & avg. 
' + if self.e_len_max: + for i in range(1, self.e_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}' + if self.report_mentions: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{mentions} ' + lines[2] += r'& total & per 1k & \MC{2}{length} ' + lines[3] += r'& count & words & max & avg. ' + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7 + if self.report_details: + lines[0] += "rrrr " + lines[1] += r'& \MC{3}{mention type} ' + lines[2] += r'&w/empty& w/gap&non-tree' + lines[3] += r'& [\%] ' * 3 + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + lines[0] += "@{~}r" * len(upos_list) + lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}" + lines[2] += ''.join(f'&{upos:7}' for upos in upos_list) + lines[3] += r'& [\%] ' * len(upos_list) + lines[0] += r'@{}}\toprule' + last_col = 1 + lines[1] += r'\\' + lines[2] += r'\\' + lines[3] += r'\\\midrule' + if self.report_basics: + last_col += 4 + lines[1] += r'\cmidrule(lr){2-5}' + if self.report_entities: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += 4 + if self.e_len_max: + last_col += self.e_len_max + lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' + if self.report_mentions: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += 4 + if self.m_len_max: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}' + last_col += self.m_len_max + 1 + if self.report_details: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}" + lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}' + print("\n".join(lines)) + + def print_footer(self, end_doc=True): + if not self.style.startswith('tex-'): + return + print(r'\bottomrule\end{tabular}\end{mypage}') + if self.style == 'tex-doc' and end_doc: + print(r'\end{document}') diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 982e4190..e4889770 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -110,8 +110,8 @@ def process_tree(self, tree): return self.visited_zones[tree.zone] += 1 - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in gold_tree.descendants] # lcs("abc", "acb") can be either "ab" or "ac". # We want to prefer the LCS with the highest number of non-focused tokens. 
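# --- Editor's sketch (illustration only, not part of this commit) ---
# How eval.F1 turns an LCS alignment of predicted vs. gold token sequences into
# precision, recall and F1 (correct = |LCS|, pred/gold = sequence lengths).
# find_lcs() below is a plain dynamic-programming stand-in written for this
# example; the block has its own helper.

def find_lcs(a, b):
    """Longest common subsequence of two token lists."""
    table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            table[i + 1][j + 1] = table[i][j] + 1 if x == y else max(table[i][j + 1], table[i + 1][j])
    lcs, i, j = [], len(a), len(b)
    while i and j:
        if a[i - 1] == b[j - 1]:
            lcs.append(a[i - 1])
            i, j = i - 1, j - 1
        elif table[i - 1][j] >= table[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return lcs[::-1]

pred = 'the cat sat on mat'.split()
gold = 'the cat sat on the mat'.split()
correct = len(find_lcs(pred, gold))
precision, recall = correct / len(pred), correct / len(gold)
f1 = 2 * precision * recall / (precision + recall)
print(f"correct={correct} P={precision:.2f} R={recall:.2f} F1={f1:.2f}")
# correct=5 P=1.00 R=0.83 F1=0.91
# --- end of editor's sketch ---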
@@ -125,6 +125,9 @@ def process_tree(self, tree): nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens) i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] while i < len(pred_tokens) and j < len(gold_tokens): + if c == len(nf_common): + common += find_lcs(pred_tokens[i:], gold_tokens[j:]) + break while nf_common[c] != pred_tokens[i]: un_pred.append(pred_tokens[i]) i += 1 @@ -135,9 +138,6 @@ def process_tree(self, tree): un_pred, un_gold = [], [] while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]: i, j, c = i+1, j+1, c+1 - if c == len(nf_common): - common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) - break common = [x for x in common if self.focus.fullmatch(x)] pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] @@ -156,6 +156,13 @@ def process_tree(self, tree): self._pred[x] += 1 self._total[x] += 1 + @property + def f1(self): + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + return 2 * precision * recall / ((precision + recall) or 1) + def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py new file mode 100644 index 00000000..7d362c7f --- /dev/null +++ b/udapi/block/msf/case.py @@ -0,0 +1,448 @@ +""" +Morphosyntactic features (UniDive): +Derive a MS Case feature from morphological case and adposition. +""" +from udapi.core.block import Block +import logging + +class Case(Block): + + adposmap = { + 'v+Loc': 'Ine', + 'uvnitř+Gen': 'Ine', + 'uvnitř+': 'Ine', + 'mezi_uvnitř+Gen': 'Ine', # annotation error? + 'uprostřed+Gen': 'Ces', + 'mezi+Ins': 'Int', + 'mezi+Nom': 'Int', # annotation error + 'mezi+Voc': 'Int', # annotation error + 'vně+Gen': 'Ext', + 'stranou+Gen': 'Ext', + 'stranou+Dat': 'Ext', + 'na+Loc': 'Ade', + 'na_mimo+Loc': 'Ade', # annotation error? + 'na_úroveň+Gen': 'Ade', + 'na_úroveň+': 'Ade', + 'v_proces+Gen': 'Ade', # ??? + 'v_rámec+Gen': 'Ade', # ??? + 'v_rámec+': 'Ade', # ??? + 'v_řada+Gen': 'Ade', # ??? + 'z_oblast+Gen': 'Ade', # ??? 
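# --- Editor's sketch (illustration only, not part of this commit) ---
# How the adposmap keys above are meant to be read: a (possibly multiword)
# adposition lemma joined with the morphological Case of the governed nominal,
# mapped to an abstract Case value. toy_adposmap and ms_case() are hypothetical
# names; the three entries are a tiny excerpt of the table above.

toy_adposmap = {
    'v+Loc': 'Ine',     # "v lese" (in the forest)     -> inessive
    'do+Gen': 'Ill',    # "do lesa" (into the forest)  -> illative
    'bez+Gen': 'Abe',   # "bez peněz" (without money)  -> abessive
}

def ms_case(adposition_lemmas, morph_case, adposmap):
    """Abstract Case for an adposition chain + morphological case of the nominal."""
    key = '_'.join(adposition_lemmas) + '+' + morph_case
    return adposmap.get(key, key)   # unknown combinations fall back to the raw key

print(ms_case(['v'], 'Loc', toy_adposmap))     # Ine
print(ms_case(['do'], 'Gen', toy_adposmap))    # Ill
print(ms_case(['bez'], 'Acc', toy_adposmap))   # bez+Acc (not in the map, kept as-is)
# --- end of editor's sketch ---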
+ 'vedle+Gen': 'Apu', + 'u+Gen': 'Chz', + 'kolem+Gen': 'Cir', + 'kol+Gen': 'Cir', + 'dokola+Gen': 'Cir', + 'okolo+Gen': 'Cir', + 'v_oblast+Gen': 'Cir', + 'v_oblast+': 'Cir', + 'blízko+Dat': 'Prx', + 'blízko+Gen': 'Prx', + 'blízko+': 'Prx', + 'nedaleko+Gen': 'Prx', + 'daleko+Gen': 'Prx', # lemma of 'nedaleko' + 'poblíž+Gen': 'Prx', + 'daleko_od+Gen': 'Dst', + 'nad+Ins': 'Sup', + 'pod+Ins': 'Sub', + 'vespod+Gen': 'Sub', + 'před+Ins': 'Ant', + 'vpředu+Gen': 'Ant', + 'na_čelo+Gen': 'Ant', + 'v_čelo+Gen': 'Ant', + 'v_čelo+': 'Ant', + 'za+Ins': 'Pst', + 'naproti+Dat': 'Opp', + 'od+Gen': 'Abl', + 'od+Dat': 'Abl', # annotation error + 'směr_od+Gen': 'Abl', + 'z_strana+Gen': 'Abl', + 'z_strana+': 'Abl', + 'z+Gen': 'Ela', + 'z+Nom': 'Ela', # annotation error + 'z+Dat': 'Ela', # annotation error + 'zevnitř+Gen': 'Ela', + 'zprostřed+Gen': 'Cne', + 's+Gen': 'Del', + 'zpod+Gen': 'Sbe', + 'zpoza+Gen': 'Pse', + 'po+Loc': 'Per', + 'cesta+Gen': 'Per', + 'cesta+Ins': 'Per', + 'napříč+Gen': 'Crs', + 'napříč+Ins': 'Crs', + 'podél+Gen': 'Lng', + 'skrz+Acc': 'Inx', + 'přes+Acc': 'Spx', + 'přes+Nom': 'Spx', # annotation error + 'ob+Acc': 'Cix', + 'po+Acc': 'Ter', + 'po+Nom': 'Ter', # annotation error + 'po+Gen': 'Ter', # annotation error + 'do+Gen': 'Ill', + 'do+Acc': 'Ill', # annotation error + 'do_/+Gen': 'Ill', + 'dovnitř+Gen': 'Ill', + 'doprostřed+Gen': 'Cnl', + 'mezi+Acc': 'Itl', + 'na+Acc': 'All', + 'na+Nom': 'All', # annotation error + 'na+Gen': 'All', # annotation error + 'k+Dat': 'Apl', + 'k+Nom': 'Apl', # annotation error + 'vstříc+Dat': 'Apl', + 'do_oblast+Gen': 'Apl', + 'směr+': 'Apl', + 'směr_k+Dat': 'Apl', + 'směr_k+': 'Apl', + 'směr_na+Acc': 'Apl', + 'v_směr_k+Dat': 'Apl', + 'nad+Acc': 'Spl', + 'nad+Nom': 'Spl', # annotation error + 'pod+Acc': 'Sbl', + 'před+Acc': 'Anl', + 'před+Gen': 'Anl', # annotation error + 'za+Acc': 'Psl', + 'dík_za+Acc': 'Psl', # annotation error? 
+ 'dokud': 'Tan', + 'nežli': 'Tan', + 'v+Acc': 'Tem', + 'v+Nom': 'Tem', # annotation error + 'v+Gen': 'Tem', # annotation error + 'při_příležitost+Gen': 'Tem', + 'současně_s+Ins': 'Tem', + 'u_příležitost+Gen': 'Tem', + 'v_období+Gen': 'Tpx', + 'počátkem+Gen': 'Din', + 'počátek+Gen': 'Din', + 'počínat+Ins': 'Din', + 'počínat+': 'Din', + 'začátkem+Gen': 'Din', + 'začátek+Gen': 'Din', + 'během+Gen': 'Dur', + 'postupem+Gen': 'Dur', + 'postup+Gen': 'Dur', + 'při+Loc': 'Dur', + 'v_průběh+Gen': 'Dur', + 'za+Gen': 'Der', + 'koncem+Gen': 'Dtr', + 'konec+Gen': 'Dtr', + 'k_konec+Gen': 'Dtr', + 'končit+Ins': 'Dtr', + 'závěrem+Gen': 'Dtr', + 'závěr+Gen': 'Dtr', + 'na_závěr+Gen': 'Dtr', + 'v_závěr+Gen': 'Dtr', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'před_po+Loc': 'Tps', + 'počínaje+Ins': 'Teg', + 'jménem+Nom': 'Atr', + 'jméno+Nom': 'Atr', + 'zdali': 'Atr', + 'že': 'Atr', + 'z_řada+Gen': 'Gen', + 's+Ins': 'Com', + 's+Nom': 'Com', # annotation error + 'spolu_s+Ins': 'Com', + 'spolu_s+': 'Com', + 'společně_s+Ins': 'Com', + 'společně_s+': 'Com', + 'v_čelo_s+Ins': 'Com', + 'v_spolupráce_s+Ins': 'Com', + 'bez+Gen': 'Abe', + 'včetně+Gen': 'Inc', + 'nad_rámec+Gen': 'Add', + 'kromě+Gen': 'Exc', + 'krom+Gen': 'Exc', + 'mimo+Acc': 'Exc', + 'mimo+Gen': 'Exc', + 'vyjma+Gen': 'Exc', + 'až_na+Acc': 'Exc', + 's_výjimka+Gen': 'Exc', + 's_výjimka+': 'Exc', + 'místo+Gen': 'Sbs', + 'místo+Ins': 'Sbs', # něčím místo něčím jiným + 'místo+Loc': 'Sbs', # annotation error + 'místo_do+Gen': 'Sbs', + 'místo_k+Dat': 'Sbs', + 'místo_na+Acc': 'Sbs', + 'místo_na+': 'Sbs', + 'místo_po+Loc': 'Sbs', + 'místo_v+Acc': 'Sbs', + 'místo_v+': 'Sbs', + 'místo_za+Acc': 'Sbs', + 'namísto+Gen': 'Sbs', + 'namísto_do+Gen': 'Sbs', + 'v_zastoupení+Gen': 'Sbs', + 'výměna_za+Acc': 'Sbs', + 'jako': 'Ess', + 'jako+': 'Ess', + 'jako+Nom': 'Ess', + 'jako+Acc': 'Ess', + 'jako+Dat': 'Ess', + 'jako_u+Gen': 'Ess', + 'jako_v+Loc': 'Ess', + 'formou+Gen': 'Ess', + 'forma+Gen': 'Ess', + 'v_forma+Gen': 'Ess', + 'v_podoba+Gen': 'Ess', + 'v_podoba+': 'Ess', + 'shoda+Gen': 'Equ', + 'v_shoda_s+Ins': 'Equ', + 'do_soulad_s+Ins': 'Sem', + 'na_způsob+Gen': 'Sem', + 'po_vzor+Gen': 'Sem', + 'úměrně+Dat': 'Sem', + 'úměrně_k+Dat': 'Sem', + 'úměrně_s+Ins': 'Sem', + 'v_analogie_s+Ins': 'Sem', + 'v_duch+Gen': 'Sem', + 'v_smysl+Gen': 'Sem', + 'oproti+Dat': 'Dsm', + 'na_rozdíl_od+Gen': 'Dsm', + 'na_rozdíl_od+': 'Dsm', + 'než': 'Cmp', + 'než+Nom': 'Cmp', + 'než+Gen': 'Cmp', + 'než+Acc': 'Cmp', + 'než_nad+Ins': 'Cmp', + 'než_v+Acc': 'Cmp', + 'než_v+Loc': 'Cmp', + 'v_poměr_k+Dat': 'Cmp', + 'v_poměr_k+': 'Cmp', + 'v_porovnání_k+Dat': 'Cmp', + 'v_porovnání_s+Ins': 'Cmp', + 'v_porovnání_s+': 'Cmp', + 'v_srovnání_s+Ins': 'Cmp', + 'v_srovnání_s+': 'Cmp', + 'o+Acc': 'Dif', + 'o+Nom': 'Dif', # annotation error + 'o+Gen': 'Dif', # annotation error + 'o+Dat': 'Dif', # annotation error + 'o_o+Acc': 'Dif', # annotation error + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'díky+Dat': 'Cau', + 'dík+Dat': 'Cau', + 'kvůli+Dat': 'Cau', + 'vinou+Gen': 'Cau', + 'vlivem+Gen': 'Cau', + 'vliv+Gen': 'Cau', + 'vliv+': 'Cau', + 'vinou+Gen': 'Cau', + 'vina+Gen': 'Cau', + 'zásluhou+Gen': 'Cau', + 'zásluha+Gen': 'Cau', + 'z_důvod+Gen': 'Cau', + 'v_důsledek+Gen': 'Cau', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'následek+Gen': 'Cau', + 'aby': 'Pur', + 'jméno+Gen': 'Pur', + 'pro_případ+Gen': 'Pur', + 'v_jméno+Gen': 'Pur', + 'v_zájem+Gen': 'Pur', + 'za_účel+Gen': 'Pur', + 'na_základ+Gen': 'Cns', + 'pod_vliv+Gen': 'Cns', + 's_ohled_na+Acc': 
'Cns', + 's_přihlédnutí_k+Dat': 'Cns', + 's_přihlédnutí_na+Acc': 'Cns', + 'v_souvislost_s+Ins': 'Cns', + 'v_souvislost_s+': 'Cns', + 'v_světlo+Gen': 'Cns', + 'vzhledem_k+Dat': 'Cns', + 'v_soulad_s+Ins': 'Cns', + 'v_soulad_s+': 'Cns', + 'z_titul+Gen': 'Cns', + 'ať': 'Ign', + 'bez_ohled_na+Acc': 'Ign', + 'nehledě_k+Dat': 'Ign', + 'nehledě_na+Acc': 'Ign', + 'navzdory+Dat': 'Ccs', + 'vzdor+Dat': 'Ccs', + 'v_rozpor_s+Ins': 'Ccs', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'pokud+Nom': 'Cnd', + 'zda': 'Cnd', + 'v_případ+Gen': 'Cnd', + 'v_případ+': 'Cnd', + 'v_závislost_na+Loc': 'Cnd', + 'v_závislost_s+Ins': 'Cnd', + 'o+Loc': 'The', + 'ohledně+Gen': 'The', + 'stran+Gen': 'The', + 'co_do+Gen': 'The', + 'na_téma+Gen': 'The', + 'na_téma+Nom': 'The', + 'na_téma+': 'The', + 'na_úsek+Gen': 'The', + 'po_stránka+Gen': 'The', + 'v_obor+Gen': 'The', + 'v_otázka+Gen': 'The', + 'v_spojení_s+Ins': 'The', + 'v_věc+Gen': 'The', + 'v_vztah_k+Dat': 'The', + 'v_vztah_k+': 'The', + 'v_záležitost+Gen': 'The', + 'v_znamení+Gen': 'The', + 'z_hledisko+Gen': 'The', + 'z_hledisko+': 'The', + 'podle+Gen': 'Quo', + 'dle+Gen': 'Quo', + 'pomocí+Gen': 'Ins', + 's_pomoc+Gen': 'Ins', + 'prostřednictvím+Gen': 'Ins', + 'prostřednictví+Gen': 'Ins', + 'prostřednictví+Ins': 'Ins', # annotation error + 'prostřednictví+': 'Ins', + 'za_pomoc+Gen': 'Ins', + 'pro+Acc': 'Ben', + 'pro+Nom': 'Ben', # annotation error + 'pro+Gen': 'Ben', # annotation error + 'pro+Ins': 'Ben', # annotation error + 'napospas+Dat': 'Ben', + 'k_prospěch+Gen': 'Ben', + 'na_úkor+Gen': 'Ben', + 'na_vrub+Gen': 'Ben', + 'v_prospěch+Gen': 'Ben', + 'v_neprospěch+Gen': 'Ben', + 'v_služba+Gen': 'Ben', + 'proti+Dat': 'Adv', + 'proti+Gen': 'Adv', + 'kontra+Nom': 'Adv', + 'versus+Nom': 'Adv', + 'vůči+Dat': 'Adv', + # subordinators + 'dokud': 'Tan', + 'nežli': 'Tan', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'zdali': 'Atr', + 'že': 'Atr', + 'jako': 'Ess', + 'než': 'Cmp', + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'aby': 'Pur', + 'ať': 'Ign', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'zda': 'Cnd', + # coordinators + 'a': 'Conj', + 'i': 'Conj', + 'ani': 'Nnor', + 'nebo': 'Disj', + 'či': 'Disj', + 'ale': 'Advs', + 'avšak': 'Advs', + 'však': 'Advs', + 'nýbrž': 'Advs', + 'neboť': 'Reas', + 'tedy': 'Cnsq', + 'tak': 'Cnsq' + } + + def process_node(self, node): + """ + Derives a case value from preposition and morphological case. Stores it + as MSFCase in MISC. + """ + # Do not do anything for function words. + # Specifically for Case, also skip 'det' and 'amod' modifiers (congruent attributes) + # because their Case is only agreement feature inherited from the head noun. + if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']: + node.misc['MSFFunc'] = 'Yes' + return + elif node.udeprel in ['det', 'amod']: + node.misc['MSFFunc'] = 'No' + return + else: + node.misc['MSFFunc'] = 'No' + # Get all case markers (adpositions) attached to the current node. + adpositions = [] + for c in node.children: + if c.udeprel == 'case': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. 
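# --- Editor's sketch (illustration only, not part of this commit) ---
# How a (possibly multiword) adposition string is assembled from a 'case' child
# and its 'fixed' children before the adposmap lookup. ToyNode and
# adposition_strings() are hypothetical stand-ins modelling only the attributes
# the logic reads.

class ToyNode:
    def __init__(self, lemma, udeprel, children=()):
        self.lemma, self.udeprel, self.children = lemma, udeprel, list(children)

def adposition_strings(node):
    lemmas = []
    for child in node.children:
        if child.udeprel == 'case':
            lemma = child.lemma
            fixed = [g.lemma for g in child.children if g.udeprel == 'fixed']
            if fixed:                         # multiword adposition, e.g. "na rozdíl od"
                lemma += '_' + '_'.join(fixed)
            lemmas.append(lemma)
    return lemmas

# "na rozdíl od něčeho" (in contrast to something): 'rozdíl' and 'od' are fixed children of 'na'.
noun = ToyNode('něco', 'obl',
               [ToyNode('na', 'case', [ToyNode('rozdíl', 'fixed'), ToyNode('od', 'fixed')])])
print(adposition_strings(noun))   # ['na_rozdíl_od'], looked up as 'na_rozdíl_od+Gen'
# --- end of editor's sketch ---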
+ fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + adpositions.append(lemma) + # We assume that all features were copied from FEATS to MISC in mwe.MsfInit. + # They may have been further processed there, so we take the input from there. + msfcase = node.misc['MSFCase'] + if adpositions: + adpostring = '_'.join(adpositions) + caseadpostring = adpostring + '+' + msfcase + if caseadpostring in self.adposmap: + msfcase = self.adposmap[caseadpostring] + else: + logging.warn(f"No Case value found for '{caseadpostring}'.") + msfcase = caseadpostring + # Omer wants to collect cases from both adpositions and subordinators + # but we will consider subordinators only if we do not have any case + # from morphology or adpositions. + if not msfcase: + subordinators = [] + for c in node.children: + if c.udeprel == 'mark': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + subordinators.append(lemma) + if subordinators: + subordstring = '_'.join(subordinators) + if subordstring in self.adposmap: + msfcase = self.adposmap[subordstring] + # To lump coordinators with all the above makes even less sense but for + # the moment we do it. + if not msfcase: + coordinators = [] + for c in node.children: + if c.udeprel == 'cc': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + coordinators.append(lemma) + if coordinators: + coordstring = '_'.join(coordinators) + if coordstring in self.adposmap: + msfcase = self.adposmap[coordstring] + node.misc['MSFCase'] = msfcase diff --git a/udapi/block/msf/createabstract.py b/udapi/block/msf/createabstract.py new file mode 100644 index 00000000..fbdf73e5 --- /dev/null +++ b/udapi/block/msf/createabstract.py @@ -0,0 +1,45 @@ +""" +Morphosyntactic features (UniDive): +Create abstract nodes representing dropped arguments of predicates (if verbal +morphology signals that the subject is third person singular, and there is no +subject node, create an abstract node and copy the features there). +""" +from udapi.core.block import Block +import re + +class CreateAbstract(Block): + + def process_node(self, node): + """ + If a node has MSFVerbForm=Fin and at least one of the agreement features + MSFNumber, MSFPerson, MSFGender, MSFAnimacy, MSFPolite, assume that these + features characterize the subject (this block is not suitable for languages + with polypersonal agreement). Check that the subject is present. If not, + create an abstract node to represent it. + """ + if node.misc['MSFVerbForm'] == 'Fin' and any([node.misc[x] for x in ['MSFNumber', 'MSFPerson', 'MSFGender', 'MSFAnimacy', 'MSFPolite']]): + # Current node is a finite predicate. Does it have a subject? If not, create an abstract one. + if not any([x.udeprel in ['nsubj', 'csubj'] for x in node.children]): + # There could already be an abstract subject. We have to look for it in the enhanced graph. + if not any([re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps]): + # Create an abstract subject. 
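# --- Editor's sketch (illustration only, not part of this commit) ---
# The priority order implemented in msf.Case above: a Case derived from
# morphology and/or adpositions wins; subordinators are consulted only if that
# yields nothing, and coordinators only as a last resort. resolve_case() is a
# hypothetical name; the two-entry map is an excerpt of adposmap above.

toy_map = {'protože': 'Cau', 'a': 'Conj'}

def resolve_case(adposition_case, subordinators, coordinators, mapping):
    if adposition_case:                       # from morphology and/or adpositions
        return adposition_case
    for markers in (subordinators, coordinators):
        key = '_'.join(markers)
        if key in mapping:
            return mapping[key]
    return ''                                 # nothing found: MSFCase stays empty

print(resolve_case('Ine', ['protože'], [], toy_map))   # Ine  (adpositional case wins)
print(resolve_case('', ['protože'], [], toy_map))      # Cau  (subordinator fallback)
print(resolve_case('', [], ['a'], toy_map))             # Conj (coordinator fallback)
# --- end of editor's sketch ---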
+ subject = node.create_empty_child('nsubj') + subject.upos = 'PRON' + subject.feats['PronType'] = 'Prs' + subject.misc['MSFPronType'] = 'Prs' + subject.feats['Case'] = 'Nom' + subject.misc['MSFCase'] = 'Nom' + for f in ['Number', 'Person', 'Gender', 'Animacy', 'Polite']: + msf = 'MSF' + f + if node.misc[msf]: + subject.feats[f] = node.misc[msf] + subject.misc[msf] = node.misc[msf] + subject.misc['MSFFunc'] = 'No' + # Regardless of whether it had a subject or not, the agreement features + # should be removed from the verb. + ###!!! We also may want to check if the pre-existing subject has all the features. + node.misc['MSFNumber'] = '' + node.misc['MSFPerson'] = '' + node.misc['MSFGender'] = '' + node.misc['MSFAnimacy'] = '' + node.misc['MSFPolite'] = '' diff --git a/udapi/block/msf/init.py b/udapi/block/msf/init.py new file mode 100644 index 00000000..ceca12af --- /dev/null +++ b/udapi/block/msf/init.py @@ -0,0 +1,53 @@ +""" +Morphosyntactic features (UniDive): +Initialization. Copies features from FEATS as MSF* attributes to MISC. +""" +from udapi.core.block import Block +import re + +class Init(Block): + + + def process_node(self, node): + """ + For every feature in FEATS, creates its MSF* counterpart in MISC. + """ + for f in node.feats: + # Only selected features will be copied. Certain features are not + # interesting for the morphosyntactic annotation. + if f not in ['Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant']: + node.misc['MSF'+f] = node.feats[f] + # We are particularly interested in the Case feature but some nominals + # lack it (e.g. acronyms or numbers). If there is a preposition, it may + # indicate the expected case of the nominal. + if not node.feats['Case']: + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + node.misc['MSFCase'] = adpositions[0].feats['Case'] + # If we did not find a preposition to help us, we may be able to read + # the case off an adjectival modifier or determiner. + if not node.misc['MSFCase']: + modifiers = [x for x in node.children if x.udeprel in ['amod', 'det'] and x.feats['Case']] + if modifiers: + node.misc['MSFCase'] = modifiers[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if not node.misc['MSFCase']: + if node.udeprel == 'nsubj': + node.misc['MSFCase'] = 'Nom' + elif node.udeprel == 'obj': + node.misc['MSFCase'] = 'Acc' + # If the node contains Phrase features in MISC (periphrastic verb forms + # detected by Lenka's code), replace the MS features with them. + phrasefeatures = [x for x in node.misc if re.match(r"^Phrase[A-Z]", x)] + for pf in phrasefeatures: + msf = pf + if msf == 'PhraseForm': + msf = 'MSFVerbForm' + else: + msf = re.sub(r"Phrase", 'MSF', pf) + node.misc[msf] = node.misc[pf] + node.misc[pf] = '' diff --git a/udapi/block/msf/numphrase.py b/udapi/block/msf/numphrase.py new file mode 100644 index 00000000..22f68c9d --- /dev/null +++ b/udapi/block/msf/numphrase.py @@ -0,0 +1,36 @@ +""" +Morphosyntactic features (UniDive): +Case in Number Phrases like 'pět mužů' (five men) in Czech. 
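# --- Editor's sketch (illustration only, not part of this commit) ---
# The fallback chain msf.Init uses above when a nominal has no Case of its own
# (acronyms, digits, ...): a single-word adposition, then a congruent amod/det
# modifier, then the node's own deprel. ToyNode and guess_case() are
# hypothetical stand-ins modelling only the attributes the logic reads.

class ToyNode:
    def __init__(self, udeprel, upos='X', case='', children=()):
        self.udeprel, self.upos, self.case, self.children = udeprel, upos, case, list(children)

def guess_case(node):
    if node.case:
        return node.case                                   # keep the morphological case
    adps = [c for c in node.children if c.udeprel == 'case' and c.upos == 'ADP']
    if len(adps) == 1 and adps[0].case and not any(g.udeprel == 'fixed' for g in adps[0].children):
        return adps[0].case                                # single-word adposition hints the case
    mods = [c for c in node.children if c.udeprel in ('amod', 'det') and c.case]
    if mods:
        return mods[0].case                                # congruent modifier carries Case
    return {'nsubj': 'Nom', 'obj': 'Acc'}.get(node.udeprel, '')   # last resort: deprel

# Acronym without Case, but governed by a locative preposition:
acronym = ToyNode('obl', upos='PROPN', children=[ToyNode('case', upos='ADP', case='Loc')])
print(guess_case(acronym))                                 # Loc
print(guess_case(ToyNode('nsubj', upos='NUM')))            # Nom
# --- end of editor's sketch ---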
+""" +from udapi.core.block import Block + +class NumPhrase(Block): + + + def process_node(self, node): + """ + Nouns with a 'nummod:gov' dependent are morphologically in genitive, + but the case of the whole phrase (number + counted noun) is different, + probably nominative or accusative. + """ + quantifiers = [x for x in node.children if x.deprel in ['nummod:gov', 'det:numgov']] + current_case = node.misc['MSFCase'] + if (current_case == 'Gen' or current_case == '') and quantifiers: + quantifier_case = quantifiers[0].misc['MSFCase'] + # The quantifier may lack the case feature (e.g. numbers expressed by digits) + # but we may be able to guess it from a preposition or other factors. + if quantifier_case == '': + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + quantifier_case = adpositions[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if quantifier_case == '': + if node.udeprel == 'nsubj': + quantifier_case = 'Nom' + elif node.udeprel == 'obj': + quantifier_case = 'Acc' + node.misc['MSFCase'] = quantifier_case diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py new file mode 100644 index 00000000..90ea5d2d --- /dev/null +++ b/udapi/block/msf/phrase.py @@ -0,0 +1,139 @@ +""" +Morphosyntactic features (UniDive): +An abstract block as a base for derivation of blocks that discover periphrastic +verb forms and save them as Phrase features in MISC. This block provides the +methods that save the features in MISC. It is based on the Writer module by +Lenka Krippnerová. +""" +from udapi.core.block import Block +import logging + +class Phrase(Block): + + def process_node(self, node): + """ + Override this in a derived class! + """ + logging.fatal('process_node() not implemented.') + + dictionary = { + 'person': 'PhrasePerson', + 'number': 'PhraseNumber', + 'mood': 'PhraseMood', + 'tense': 'PhraseTense', + 'voice': 'PhraseVoice', + 'aspect':'PhraseAspect', + 'form': 'PhraseForm', + 'reflex': 'PhraseReflex', + 'polarity': 'PhrasePolarity', + 'gender':'PhraseGender', + 'animacy':'PhraseAnimacy', + 'ords':'Phrase', + 'expl':'PhraseExpl', + } + + # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation + # we do not want to include these negative particles in the phrase; these are expressions like "never", etc. 
+ negation_fixed = { + # Belarusian + 'ні' : ['раз'], + 'ня' : ['толькі'], + + # Upper Sorbian + 'nic' : ['naposledku'], + + # Polish + 'nie' : ['mało'], + + # Pomak + 'néma' : ['kak'], + + # Slovenian + 'ne' : ['le'], + + # Russian and Old East Slavic + 'не' : ['то', 'токмо'], + 'ни' : ['в', 'раз', 'шатко'], + 'нет' : ['нет'] + } + + def write_node_info(self, node, + tense = None, + person = None, + number = None, + mood = None, + voice = None, + form = None, + reflex = None, + polarity = None, + ords = None, + gender = None, + animacy = None, + aspect = None, + expl=None): + arguments = locals() + del arguments['self'] # delete self and node from arguments, + del arguments['node'] # we want only grammatical categories + for key,val in arguments.items(): + if val != None: + node.misc[self.dictionary[key]] = val + + def has_fixed_children(self, node): + """ + Returns True if the node has any children with the 'fixed' relation and the node's lemma along with the child's lemma are listed in self.negation_fixed. + """ + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + + if fixed_children: + if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []): + return True + return False + + def get_polarity(self, nodes): + """ + Returns 'Neg' if there is exactly one node with Polarity='Neg' among the given nodes. + Returns an empty string if there are zero or more than one such nodes. + """ + neg_count = 0 + for node in nodes: + if node.feats['Polarity'] == 'Neg': + neg_count += 1 + + if neg_count == 1: + return 'Neg' + + # neg_count can be zero or two, in either case we want to return an empty string so that the PhrasePolarity attribute is not generated + else: + return '' + + def get_negative_particles(self, nodes): + """ + Returns a list of all negative particles found among the children + of the specified nodes, except for negative particles with fixed children specified in self.negation_fixed. + """ + neg_particles = [] + for node in nodes: + neg = [x for x in node.children if x.upos == 'PART' and x.feats['Polarity'] == 'Neg' and x.udeprel == 'advmod' and not self.has_fixed_children(x)] + if neg: + neg_particles += neg + return neg_particles + + + def get_is_reflex(self,node,refl): + if node.feats['Voice'] == 'Mid': + return 'Yes' + if len(refl) == 0: + return node.feats['Reflex'] + return 'Yes' + + def is_expl_pass(self,refl): + if len(refl) == 0: + return False + return refl[0].deprel == 'expl:pass' + + def get_voice(self,node,refl): + voice = node.feats['Voice'] + if self.is_expl_pass(refl): + return 'Pass' + return voice + diff --git a/udapi/block/msf/removefunc.py b/udapi/block/msf/removefunc.py new file mode 100644 index 00000000..e169a2de --- /dev/null +++ b/udapi/block/msf/removefunc.py @@ -0,0 +1,17 @@ +""" +Morphosyntactic features (UniDive): +Cleanup. Removes MSF* features from MISC for function nodes (MSFFunc=Yes). +""" +from udapi.core.block import Block + +class RemoveFunc(Block): + + + def process_node(self, node): + """ + Removes MSF* features if MSFFunc=Yes. 
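# --- Editor's sketch (illustration only, not part of this commit) ---
# The convention behind get_polarity() above: a periphrastic form is marked
# PhrasePolarity=Neg only when exactly one of its words is negative; zero or
# two negations leave the attribute unset. phrase_polarity() is a hypothetical
# name working on plain Polarity strings instead of udapi nodes.

def phrase_polarity(word_polarities):
    negatives = sum(1 for p in word_polarities if p == 'Neg')
    return 'Neg' if negatives == 1 else ''

print(phrase_polarity(['Pos', 'Neg']))   # Neg (e.g. "nebudu spát" - I will not sleep)
print(phrase_polarity(['Pos', 'Pos']))   # ''  (affirmative phrase, attribute not set)
print(phrase_polarity(['Neg', 'Neg']))   # ''  (two negations, attribute not set)
# --- end of editor's sketch ---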
+ """ + if node.misc['MSFFunc'] == 'Yes': + msfeats = [x for x in node.misc if x.startswith('MSF')] + for msf in msfeats: + node.misc[msf] = '' diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py new file mode 100644 index 00000000..dd2393f7 --- /dev/null +++ b/udapi/block/msf/romance/romance.py @@ -0,0 +1,523 @@ + +import udapi.block.msf.phrase +from enum import Enum + +class Aspect(str, Enum): + IMP = 'Imp' + IMPPROG = 'ImpProg' + PERF = 'Perf' + PERFPROG = 'PerfProg' + PROG = 'Prog' + PQP = 'Pqp' + +class Tense(str, Enum): + FUT = 'Fut' + FUTFUT = 'FutFut' + PAST = 'Past' + PASTFUT = 'PastFut' + PASTPRES = 'PastPres' + PRES = 'Pres' + +class Romance(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + + cop = [x for x in node.children if x.udeprel == 'cop'] + + # only expl or expl:pv, no expl:impers or expl:pass + refl = [x for x in node.children if x.lemma == 'se' and x.upos == 'PRON' and x.udeprel == 'expl' and x.udeprel != 'expl:impers' and x.udeprel != 'expl:pass'] + + if refl: + expl='Pv' + else: + expl=None + + if cop: + auxes = [x for x in node.children if x.udeprel == 'aux'] + if auxes: + self.process_periphrastic_verb_forms(cop[0], auxes, refl, auxes + cop, node) + else: + # no auxiliaries, only cop + self.process_copulas(node,cop,auxes,refl,expl) + return + + if node.upos == 'VERB': + auxes = [x for x in node.children if x.udeprel == 'aux'] + aux_pass = [x for x in node.children if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in node.children if x.udeprel == 'aux' and x.deprel != 'aux:pass'] + + # infinitive with a subject is a subjunctive + subj = [x for x in node.children if x.udeprel == 'subj'] + if node.feats['VerbForm'] == 'Inf' and subj: + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + mood='Sub', + form='Fin', + tense=Tense.FUT.value, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + ords=[node.ord] + ) + return + + if not auxes: + phrase_ords = [node.ord] + [r.ord for r in refl] + phrase_ords.sort() + + # presente -> PhraseTense=Pres, PhraseAspect='' + # Futuro do presente -> PhraseTense=Fut, PhraseAspect='' + aspect = '' + tense = node.feats['Tense'] + + if node.feats['Mood'] == 'Ind': + + # pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf + if node.feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + + # pretérito mais que perfeito simples -> PhraseTense=Past, PhraseAspect=Pqp + if node.feats['Tense'] == 'Pqp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' + if node.feats['Mood'] == 'Sub': + + if node.feats['Tense'] == 'Past': + aspect=Aspect.IMP.value + + # subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + if node.feats['Mood'] == 'Cnd': + aspect='' + tense=Tense.PRES.value + + + self.write_node_info(node, + person=node.feats['Person'], + aspect=aspect, + number=node.feats['Number'], + mood=node.feats['Mood'], + form=node.feats['VerbForm'], + tense=tense, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + ords=phrase_ords + ) + + + else: + # 
no passive auxiliaries + if not aux_pass: + self.process_periphrastic_verb_forms(node, auxes, refl, auxes, node) + + # head verb has one passive auxiliary and no more other auxiliaries + # TODO complete the tenses and aspects for individual verb forms + elif not auxes_without_pass: + phrase_ords = [node.ord] + [x.ord for x in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.write_node_info(node, + person=aux_pass[0].feats['Person'], + number=aux_pass[0].feats['Number'], + mood=aux_pass[0].feats['Mood'], + form='Fin', + tense=aux_pass[0].feats['Tense'], + gender=node.feats['Gender'], + voice='Pass', + expl=expl, + ords=phrase_ords + ) + + # head verb has passive auxiliary and also other auxiliaries + else: + self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, refl, auxes, node) + + + def process_periphrastic_verb_forms(self, node, auxes, refl, all_auxes, head_node): + """ + Parameters + - node: if there is no passive then the node is the head verb, if the head verb is in the passive, then the node is the passive auxiliary + - auxes: list of all auxiliaries except the passive auxes + - refl: list of reflexives which should be included into the periphrastic phrase + - all_auxes: list of all auxiliaries (passive auxes are included) + - head_node: the node which should have the Phrase* attributes, i. e. the head of the phrase + + annotates periphrastic verb forms with the Phrase* attributes + """ + + if refl: + expl='Pv' + else: + expl=None + + if len(auxes) == 1: + # Cnd + if ((auxes[0].lemma == 'ter' and node.feats['VerbForm'] == 'Part') or (auxes[0].lemma == 'estar' and node.feats['VerbForm'] == 'Ger')) and auxes[0].feats['Mood'] == 'Cnd': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + [r.ord for r in refl] + phrase_ords.sort() + + # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].lemma == 'estar': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect=Perf, PhraseMood=Cnd + else: + tense=Tense.PAST.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + aspect=aspect, + mood='Cnd', + form='Fin', + expl=expl, + voice=head_node.feats['Voice'], + ords=phrase_ords) + return + + # Auxiliary 'estar' followed by a gerund + if auxes[0].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg + # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + # conditional (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + elif auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog + # futuro do presente (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog + # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub + # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, 
PhraseMood=Sub + else: + tense=auxes[0].feats['Tense'] + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + aspect=aspect, + expl=expl, + ords=phrase_ords) + + # Auxiliary 'ter' followed by a participle + if auxes[0].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf + aspect=Aspect.PERF.value + tense=auxes[0].feats['Tense'] + + # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf + # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp + # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: # TODO prej neni v Past, jenom Imp + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + if auxes[0].lemma == 'haver' and auxes[0].feats['Tense'] == 'Imp' and node.feats['VerbForm'] == 'Part': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.write_node_info(head_node, + tense=Tense.PAST.value, + aspect=Aspect.PERF.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # aux Pres (vir) + gerund -> PhraseTense=PastPres, PraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PROG.value, + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + + # auxiliary 'ir' followed by infinitive + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Inf': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + tense=node.feats['Tense'] + aspect='' + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect='' + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect='' + elif auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect='' + + # Futuro perifrástico passado perf -> PhraseTense=PastFut, 
PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog + tense = auxes[0].feats['Tense'] + aspect = Aspect.PROG.value + + # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + elif len(auxes) == 2: + # auxiliry 'ir' followed by auxiliary 'estar' in infinitive and a gerund + if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMPPROG.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PROG.value + + if auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERFPROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle + if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PERF.value + + # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + + + # Cnd (only ter), Sub and Past,Pres,Fut tenses: 2 auxes - ter + estar + if auxes[0].lemma in ['ter', 'haver'] and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + tense = auxes[0].feats['Tense'] + aspect = Aspect.PERFPROG.value + + # aux ter 
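# Illustrative sketch (not part of the diff above): a simplified, standalone
# version of the single-auxiliary mapping implemented by this block. Given the
# auxiliary's lemma, Tense and Mood and the content verb's VerbForm, it returns
# the (PhraseTense, PhraseAspect) pair. It covers only a few of the
# combinations the block handles, works on plain strings instead of udapi
# nodes, and the function name is invented.
def periphrastic_tense_aspect(aux_lemma, aux_tense, aux_mood, verb_form):
    if aux_lemma == 'estar' and verb_form == 'Ger':
        if aux_mood == 'Cnd':
            return 'Pres', 'Prog'        # estar (cond) + gerund
        if aux_tense == 'Imp':
            return 'Past', 'ImpProg'     # pretérito imperfeito progressivo
        if aux_tense == 'Past':
            return 'Past', 'PerfProg'    # pretérito perfeito progressivo
        return aux_tense, 'Prog'         # presente / futuro progressivo
    if aux_lemma == 'ter' and verb_form == 'Part':
        if aux_tense == 'Pres':
            return 'PastPres', 'Perf'    # pretérito perfeito composto
        if aux_tense in ('Imp', 'Past'):
            return 'Past', 'Pqp'         # mais-que-perfeito composto
        return aux_tense, 'Perf'         # futuro do presente composto
    if aux_lemma == 'ir' and verb_form == 'Inf':
        if aux_tense == 'Pres':
            return 'Fut', ''             # futuro perifrástico
        if aux_tense == 'Imp':
            return 'PastFut', 'Imp'
        if aux_tense == 'Fut':
            return 'FutFut', ''
    return aux_tense, ''

assert periphrastic_tense_aspect('ter', 'Pres', 'Ind', 'Part') == ('PastPres', 'Perf')
assert periphrastic_tense_aspect('ir', 'Imp', 'Ind', 'Inf') == ('PastFut', 'Imp')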
cond + estar pp + gerund -> PhraseTense=Past, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg + # subjonctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg + # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': + tense=Tense.FUT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords, + ) + return + + def process_copulas(self, node, cop, auxes, refl, expl): + + if not auxes: + tense = cop[0].feats['Tense'] + number=cop[0].feats['Number'] + person=cop[0].feats['Person'] + mood=cop[0].feats['Mood'] + + if cop[0].feats['Tense'] in ['Pres', 'Fut']: + if cop[0].lemma == 'ser': + aspect=Aspect.PERF.value + elif cop[0].lemma == 'estar': + aspect=Aspect.IMP.value + + elif cop[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + elif cop[0].feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + else: + # i.e. copulas in infinitive + aspect='' + + else: + tense = auxes[0].feats['Tense'] + number=auxes[0].feats['Number'] + person=auxes[0].feats['Person'] + mood=auxes[0].feats['Mood'] + aspect='' + + + if auxes[0].lemma == 'estar': + aspect=Aspect.IMPPROG.value + + phrase_ords = [node.ord] + [x.ord for x in cop] + [x.ord for x in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.write_node_info(node, + tense=tense, + number=number, + person=person, + mood=mood, + form='Fin', + aspect=aspect, + voice=node.feats['Voice'], + expl=expl, + ords=phrase_ords, + ) diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py new file mode 100644 index 00000000..89eafd6c --- /dev/null +++ b/udapi/block/msf/slavic/conditional.py @@ -0,0 +1,85 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects conditional verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Conditional(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') or node.feats['VerbForm'] == 'Fin': + # in most Slavic languages, the verb has feats['VerbForm'] == 'Part' but in Polish the verb has feats['VerbForm'] == 'Fin' + + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # list for auxiliary verbs for forming the conditional mood + cop = [x for x in node.children if x.udeprel == 'cop'] # in some cases it may happen that the cop follows the noun, we don't want to these cases in this branch + # in Polish the auxiliary verbs for conditional mood have deprel == 'aux:cnd', in other languages the auxiliary verbs have x.feats['Mood'] == 'Cnd' + + # the conditional mood can be formed using the auxiliary verb or some conjunctions (such as 'aby, kdyby...' in Czech) + # so x.udeprel == 'aux' can't be required because it doesn't meet the conjunctions + + if aux_cnd and not cop: + aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd'] # all auxiliary verbs and conjuctions with feats['Mood'] == 'Cnd' + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux_cnd[0] + + person='3' # TODO there is a problem in russian etc. (same as in past tense) + if auxVerb.feats['Person'] != '': + person=auxVerb.feats['Person'] + + + self.write_node_info(node, + person=person, + number=node.feats['Number'], + mood='Cnd', + form='Fin', + aspect=node.feats['Aspect'], + reflex=self.get_is_reflex(node,refl), + polarity=self.get_polarity(phrase_nodes), + voice=self.get_voice(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel=='aux:cnd'] + + if cop and aux_cnd: + # there can be a copula with Mood='Cnd' (i. e. 
in Old East Slavonic), we don't want to count these copula in phrase_ords twice, so there is x.udeprel != 'cop' in aux list + aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + prep + refl + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Cnd', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords, + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'] + ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py new file mode 100644 index 00000000..6b725d56 --- /dev/null +++ b/udapi/block/msf/slavic/converb.py @@ -0,0 +1,91 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects converb (transgressive) forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Converb(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # condition node.upos == 'VERB' to prevent copulas from entering this branch + if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + form='Conv', + tense=node.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + voice=self.get_voice(node, refl) + ) + + # passive voice + elif node.upos == 'ADJ': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv'] + + if aux: + auxVerb = aux[0] + + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + form='Conv', + tense=auxVerb.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=auxVerb.feats['Gender'], + animacy=auxVerb.feats['Animacy'], + voice='Pass' + ) + + # copulas + else: + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv'] + + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + 
number=copVerb.feats['Number'], + tense=copVerb.feats['Tense'], + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + form='Conv', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + voice=self.get_voice(copVerb, refl) + ) diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py new file mode 100644 index 00000000..02452c36 --- /dev/null +++ b/udapi/block/msf/slavic/future.py @@ -0,0 +1,200 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects future tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Future(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # future tense for Serbian and Croatian + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')] + if node.upos != 'AUX' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + aux_other = [x for x in node.children if x.udeprel == 'aux'] # adding aux for passive voice + cop = [x for x in node.children if x.deprel == 'cop'] + + phrase_nodes = [node] + refl + aux_other + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + if not cop: + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], # srbstina ani chorvatstina vidy nema + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + ords=phrase_ords + ) + else: + prep = [x for x in node.children if x.upos == 'ADP'] + phrase_nodes += prep + phrase_ords += [x.ord for x in prep] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + ords=phrase_ords + ) + + return + + # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense + # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + + if node.feats['Tense'] == 'Pres' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + # future tense of perfect verbs + # Upper Sorbian forms the future tense in this way, however, the feats[Aspect] are not listed in the data + # in some languages ​​(e.g. in Russian) these verbs have the Tense Fut, in others (e.g. 
in Czech) they have the Tense Pres + """if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + form='Fin', + aspect='Perf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return""" + + + # future tense of imperfect verbs and passive voice + # in some languages ​​the verb is in the infinitive, in some it is in the l-participle + # the condition node.upos == 'ADJ' is due to the passive voice - the n-participle is marked as ADJ, but the auxiliary verb is not cop, but aux + if node.upos == 'VERB' or node.upos == 'ADJ': + + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut'] + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + if aux: + auxVerb = aux[0] + self.write_node_info(node, + tense='Fut', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + # simple future tense - e.g. in Serbian, the future tense can be formed by combining a verb with a full meaning and an auxiliary verb into one word, i.e. without an auxiliary verb + # or verbs like pojede, půjdeme... 
in Czech + + if not aux and node.feats['Tense'] == 'Fut': + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut'] + if cop: + copVerb = cop[0] + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood']=='Ind'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Fut', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) + diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py new file mode 100644 index 00000000..d4fedd50 --- /dev/null +++ b/udapi/block/msf/slavic/imperative.py @@ -0,0 +1,86 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects imperative verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Imperative(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # the condition node.upos == 'VERB' ensures that copulas do not enter this branch + if node.feats['Mood'] == 'Imp' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + aspect=node.feats['Aspect'], + mood='Imp', + form='Fin', + voice='Act', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + # verbs in the passive forms are marked as ADJ + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp'] + if aux: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Imp', + voice='Pass', + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp'] + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + 
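# Illustrative sketch (not part of the diff above): the future-tense triggers
# used by this block, restated over plain dicts instead of udapi nodes; the
# helper name is invented. A phrase is treated as future if the content verb
# itself is Tense=Fut, if it has a Tense=Fut auxiliary, a present-tense form
# of hteti/htjeti (Serbian/Croatian), or the particle ќе (Macedonian) / ще
# (Bulgarian) next to a present-tense verb.
def is_future_phrase(verb, auxiliaries):
    if verb.get('Tense') == 'Fut':
        return True
    for aux in auxiliaries:
        if aux.get('Tense') == 'Fut':
            return True
        if aux.get('lemma') in ('hteti', 'htjeti') and aux.get('Tense') == 'Pres':
            return True
        if aux.get('lemma') in ('ќе', 'ще') and verb.get('Tense') == 'Pres':
            return True
    return False

# Serbian "ću raditi": present-tense auxiliary of 'hteti' plus an infinitive.
assert is_future_phrase({'Tense': ''}, [{'lemma': 'hteti', 'Tense': 'Pres'}])
assert not is_future_phrase({'Tense': 'Pres'}, [])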
phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Imp', + form='Fin', + voice=self.get_voice(copVerb, refl), + reflex=self.get_is_reflex(node, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py new file mode 100644 index 00000000..f39a2646 --- /dev/null +++ b/udapi/block/msf/slavic/infinitive.py @@ -0,0 +1,103 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects infinitive verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Infinitive(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB': + aux = [x for x in node.children if x.udeprel == 'aux'] + if not aux: # the list of auxiliary list must be empty - we don't want to mark infinitives which are part of any other phrase (for example the infinititive is part of the future tense in Czech) + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes == neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 'Inf'] + if aux and not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Pass', + form='Inf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + number=node.feats['Number'] + ) + return + + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + if cop and not aux_forb: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + voice=self.get_voice(cop[0], refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords + ) + + # there is a rare verb form called supine in Slovenian, it is used instead of infinitive as the argument of motion verbs + if node.feats['VerbForm'] == 'Sup': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = 
self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Act', + form='Sup', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py new file mode 100644 index 00000000..423bff45 --- /dev/null +++ b/udapi/block/msf/slavic/past.py @@ -0,0 +1,207 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects past tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Past(udapi.block.msf.phrase.Phrase): + + def get_person_for_langs_with_simple_past(self, node, person): + """ + returns the person which is known from subject, languages with the simple past tense (e. g. Russian) do not express person in these verb forms + if the person was not taken from the subject, the third person would be filled in automatically due to languages ​​with a compound past but simple forms for the third person (e. g. Czech) + """ + subj = [x for x in node.children if x.udeprel == 'nsubj'] + if subj: + subj = subj[0] + if subj.feats['Person'] != '': + person = subj.feats['Person'] + return person + + def process_node(self, node): + + past_tenses = ['Past', 'Imp', 'Pqp'] + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)] + + # there is person 0 in Polish and Ukrainian which is for impersonal statements + # in Polish, verbs with Person=0 have also Tense=Past, in Ukrainian the tense is not specified + if node.feats['Person'] == '0': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood=node.feats['Mood'], + voice='Act', #In Polish, impersonal statements are annotated with Voice=Act. 
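# Illustrative sketch (not part of the diff above): the person fallback
# described in get_person_for_langs_with_simple_past, restated over plain
# dicts. In languages whose past tense is a simple form (e.g. Russian), the
# l-participle carries no Person, so the value is recovered from the nsubj
# child when one is present; otherwise the default third person is kept.
def person_from_subject(children, default='3'):
    subjects = [c for c in children if c.get('udeprel') == 'nsubj']
    if subjects and subjects[0].get('Person'):
        return subjects[0]['Person']
    return default

# Russian "я пришёл": the participle has no Person, the pronoun subject does.
assert person_from_subject([{'udeprel': 'nsubj', 'Person': '1'}]) == '1'
assert person_from_subject([]) == '3'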
In Ukrainian, the Voice feature is missing; therefore, we decided to annotate these phrases with PhraseVoice=Act + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + # compound past tense + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + aux_pqp + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux: + person = aux[0].feats['Person'] + + elif not aux: + person = '3' + + if aux_pqp: + person = aux_pqp[0].feats['Person'] + + # in Slovenian, the participles are not annotated as Tense='Past', the Tense feature is missing here + # but in Bulgarian, there are cases where the participles are annotated as Tense='Imp' + tense = 'Past' + if node.feats['Tense'] == 'Imp': + tense = 'Imp' + if node.feats['Tense'] == 'Pqp': + tense = 'Pqp' + + self.write_node_info(node, + tense=tense, + person=person, + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + + # the past tense of some Slavic languages ​​is formed only by a verb without an auxiliary verb (e.g. Polish) + # or imperfect (special case of the past tense) e.g. 
in Bulgarian or Croatian + elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv': + + # the past tense is formed only by a content verb, not with an auxiliary + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + + if not aux_forb: + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + + + # passive + elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux_past_tense: + aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] # e. g. the auxiliary 'jsem' in the phrase 'byl jsem přinucen' + + phrase_nodes = [node] + aux_past_tense + aux_pres_tense + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_pres_tense: + person = aux_pres_tense[0].feats['Person'] + person = self.get_person_for_langs_with_simple_past(node, person) + + self.write_node_info(node, + tense=aux_past_tense[0].feats['Tense'], + person=person, + number=aux_past_tense[0].feats['Number'], + mood='Ind', + voice='Pass', + form='Fin', + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + else: + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if cop and not aux_cnd: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux_past_tense + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_past_tense: + person = aux_past_tense[0].feats['Person'] + + # In ru, be, uk, the person is not expressed in past tense and the verbform is Fin, not Part + if cop[0].feats['VerbForm'] == 'Fin': + person = '' + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + tense=cop[0].feats['Tense'], + person=person, + number=cop[0].feats['Number'], + mood='Ind', + voice=self.get_voice(cop[0], refl), + form='Fin', + reflex=self.get_is_reflex(node,refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=cop[0].feats['Gender'], + animacy=cop[0].feats['Animacy'] + ) diff --git a/udapi/block/msf/slavic/preprocessor.py 
b/udapi/block/msf/slavic/preprocessor.py new file mode 100644 index 00000000..804a081f --- /dev/null +++ b/udapi/block/msf/slavic/preprocessor.py @@ -0,0 +1,83 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block serves as a preprocessor for Slavic languages before the other blocks +are applied to detect periphrastic verb forms. It improves harmonization of +annotations across the treebanks by addressing some known divergences. +""" + +from udapi.core.block import Block + +class Preprocessor(Block): + + def process_node(self,node): + + # in Ukrainian the active verb forms are not marked as PhraseVoice=Act + if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '': + node.feats['Voice'] = 'Act' + + # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ + # we change the UPOS to ADJ when a participle expresses case + if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': + node.upos = 'ADJ' + + # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not as in the last Slavic languages ​​feats['Mood'] == 'Cnd' + if node.deprel == 'aux:cnd': + node.feats['Mood'] = 'Cnd' + + # unify polarities - some languages ​​mark only Neg (Russian), some mark both Neg and Pos (Czech) + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + + # In Ukrainian, there is no explicit annotation of reflexive verbs + # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly + # We add the feature Voice=Mid to reflexive verbs + if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')): + node.feats['Voice'] = 'Mid' + + # makedonstina tvori budouci cas pomoci pomocneho slova ќе, u nejz neni nijak vyznaceno, ze se podili na tvorbe budouciho casu + # stejne tak bulharstina pomoci pomocneho slova ще + # makedonstina a bulharstina + if node.feats['Tense'] == 'Pres': + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + if len(aux) == 1: + aux[0].feats['Tense'] = 'Fut' + + # in Czech and in Old Church Slavonic, the participles are sometimes marked with the plural gender + if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc': + subj = [x for x in node.children if x.udeprel == 'nsubj'] + + # for relative pronouns, only one gender is indicated + if len(subj) == 1: + conj = [x for x in subj[0].children if x.deprel == 'conj'] + if len(conj) == 0: + node.feats['Gender'] = subj[0].feats['Gender'] + node.feats['Number'] = subj[0].feats['Number'] + + # participles in passive are sometimes annotated as VERB, sometimes as ADJ + if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + + # there are cases where the node has deprel=='expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (i.e. 
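# Illustrative sketch (not part of the diff above): two of the harmonizations
# above, applied to a plain (deprel, feats) pair instead of a udapi node; the
# function name is invented. Polish marks the conditional on the relation
# (aux:cnd) rather than in Mood, and some treebanks annotate Polarity=Pos
# explicitly while others leave it empty, so both are normalized here.
def harmonize(deprel, feats):
    feats = dict(feats)
    if deprel == 'aux:cnd':
        feats['Mood'] = 'Cnd'
    if feats.get('Polarity') == 'Pos':
        feats['Polarity'] = ''
    return feats

assert harmonize('aux:cnd', {})['Mood'] == 'Cnd'
assert harmonize('aux', {'Polarity': 'Pos'})['Polarity'] == ''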
Macedonian treebank) + # we add the Reflex=Yes feature + if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers': + node.feats['Reflex'] = 'Yes' + + # fixing the mistake in Macedonian treebank (mk_mtb-ud-test.conllu), in sent_id=other0010, there is personal pronoun 'ми' marked as expl:pv, it should be iobj + if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs': + node.deprel = '' + node.udeprel = 'iobj' + + # in Old Church Slavonic, there is feature Mood=Sub, but this is a notation for conditional mood + if node.feats['Mood'] == 'Sub': + node.feats['Mood'] = 'Cnd' + + # although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation + if node.feats['VerbForm'] == 'Inf': + node.feats['Tense'] = '' + + # in the russian Syntagrus corpus, the negative particles have no Polarity=Neg feature + if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod': + node.feats['Polarity'] = 'Neg' + + # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech + # but there are many verbs with no Aspect value, so the problem is still there diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py new file mode 100644 index 00000000..9a743a9e --- /dev/null +++ b/udapi/block/msf/slavic/present.py @@ -0,0 +1,128 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects present tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Present(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs + # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin': #and node.feats['Aspect']=='Imp': + + aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) + + if not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Pres', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + # passive voice + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 
'byl jsem poučen' in Czech) + + if aux and not aux_forb: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux[0] + + self.write_node_info(node, + tense='Pres', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + form='Fin', + voice='Pass', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + # participles + # in some languages, participles are used as attributes (they express case and degree) + if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part': + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + cop = [x for x in node.children if x.udeprel == 'cop'] + + if not aux_forb and not cop: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + tense=node.feats['Tense'], + number=node.feats['Number'], + form='Part', + voice=self.get_voice(node, refl), + reflex=self.get_is_reflex(node, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) + return + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres'] + aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense + + if cop and not aux_forb: + aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Pres', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + reflex=self.get_is_reflex(node, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) diff --git a/udapi/block/mwe/normalize.py b/udapi/block/mwe/normalize.py new file mode 100644 index 00000000..e7ebf24f --- /dev/null +++ b/udapi/block/mwe/normalize.py @@ -0,0 +1,68 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and normalizes it so that the type is always annotated at the first word of + the expression.""" +from udapi.core.block import Block +import logging +import re + +class Normalize(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). 
+ """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves them back but makes sure that the type is annotated at the + first word of the expression (as opposed to the syntactic head or to + any other word). + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for n in nodes: + # Erase the previous MWE annotations so we can start from scratch. + n.misc['Mwe'] = '' + # There may be multiple MWEs this node is member of. + annotations = [] + for m in mwes_by_nodes[n.ord]: + if n.ord == mwes[m]['nodes'][0]: + annotations.append("%s:%s" % (m, mwes[m]['type'])) + else: + annotations.append(m) + if annotations: + n.misc['Mwe'] = ';'.join(annotations) diff --git a/udapi/block/mwe/possessives.py b/udapi/block/mwe/possessives.py new file mode 100644 index 00000000..0849a210 --- /dev/null +++ b/udapi/block/mwe/possessives.py @@ -0,0 +1,74 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC, + looks for dependent possessive pronouns and reports how they are treated.""" +from udapi.core.block import Block +import logging +import re + +class Possessives(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). 
+ # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then surveys the possessive pronouns. + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for m in mwes: + mwenodes = [x for x in nodes if m in mwes_by_nodes[x.ord]] + mweheads = [x for x in mwenodes if not x.parent in mwenodes] + mwedescendantset = set() + for x in mweheads: + mwedescendantset = mwedescendantset.union(set(x.descendants)) + mwedescendants = list(sorted(mwedescendantset)) + # Is there a possessive pronoun? + possprons = [x for x in mwedescendants if x.upos == 'PRON' and x.feats['Poss'] == 'Yes'] + inpp = [x for x in possprons if m in mwes_by_nodes[x.ord]] + outpp = [x for x in possprons if not m in mwes_by_nodes[x.ord]] + observation = '' + if inpp and outpp: + observation = 'both' + elif inpp: + observation = 'in' + elif outpp: + observation = 'out' + if observation: + expression = ' '.join([x.form if m in mwes_by_nodes[x.ord] else '('+x.form+')' for x in mwedescendants]) + print(observation + ': ' + expression) diff --git a/udapi/block/mwe/tosubdeprels.py b/udapi/block/mwe/tosubdeprels.py new file mode 100644 index 00000000..3682c0c7 --- /dev/null +++ b/udapi/block/mwe/tosubdeprels.py @@ -0,0 +1,62 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and projects it to subtypes of dependency relation labels. The motivation is + that a parser could learn to predict the multiword expressions.""" +from udapi.core.block import Block +import logging +import re + +class ToSubDeprels(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). 
+ # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves the type of the MWE as a subtype of the deprels inside. + """ + nodes = root.descendants + (mwes, mwes_by_nodes) = self.collect_mwes(root) + # Now we hopefully know the type of every multiword expression in the sentence. + for n in nodes: + if mwes_by_nodes[n.ord]: + for m in mwes_by_nodes[n.ord]: + type = re.sub(r"\.", '', mwes[m]['type'].lower()) + # Add the MWE type to the DEPREL if the parent is also in the same MWE. + if n.parent.ord > 0 and m in mwes_by_nodes[n.parent.ord]: + n.deprel += ':' + type diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py new file mode 100644 index 00000000..040174be --- /dev/null +++ b/udapi/block/read/addtext.py @@ -0,0 +1,59 @@ +"""read.AddText is a reader for adding word-wrapped plain-text to existing trees.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +import logging + +class AddText(BaseReader): + r"""A reader for plain-text files to be stored to existing trees. + + For example LitBank conll files are segmented to sentences and tokenized, + but the SpacesAfter attributes are missing. We need to load the original + (raw) texts, which are not tokenized and not segmented, only word-wrapped + (to 70 characters per line). + + Args: + add_newpar: add newpar CoNLL-U annotations on empty lines (and the beginning of file) + """ + def __init__(self, zone='', add_newpar=True, **kwargs): + super().__init__(zone=zone, **kwargs) + self.add_newpar = add_newpar + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. 
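# Illustrative sketch (not part of the diff above): how mwe.ToSubDeprels
# projects the MWE type into the relation label. The type is lowercased, dots
# are removed, and the result is appended as a deprel subtype on edges whose
# parent belongs to the same expression; the helper name is invented.
import re

def mwe_subdeprel(deprel, mwe_type):
    return deprel + ':' + re.sub(r"\.", '', mwe_type.lower())

assert mwe_subdeprel('obj', 'LVC.full') == 'obj:lvcfull'
assert mwe_subdeprel('compound', 'VID') == 'compound:vid'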
+ """ + return False + + def process_document(self, document): + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True + return + text = ''.join(self.filehandle.readlines()) + i, end, was_newpar = 0, len(text), True + while i <= end and text[i].isspace(): + i += 1 + + for bundle in document.bundles: + root = bundle.get_tree(zone=self.zone) + if self.add_newpar and was_newpar: + root.newpar = True + was_newpar = False + for node in root.token_descendants: + if text[i:i+len(node.form)] == node.form: + i += len(node.form) + if i > end or text[i].isspace(): + del node.misc['SpaceAfter'] + was_newpar = i+1 < end and text[i+1] == '\n' and text[i] == '\n' + while i <= end and text[i].isspace(): + i += 1 + else: + node.misc['SpaceAfter'] = 'No' + was_newpar = False + else: + logging.warning('Node %s does not match text "%s"', node, text[i:i+20]) + return + root.text = root.compute_text() + self.finished = not self.files.has_next_file() diff --git a/udapi/block/read/ccv.py b/udapi/block/read/ccv.py new file mode 100644 index 00000000..eb449362 --- /dev/null +++ b/udapi/block/read/ccv.py @@ -0,0 +1,78 @@ +"""Ccv class is a reader for Corpus of Czech Verse json files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText +import json + +class Ccv(BaseReader): + r"""A reader for Corpus of Czech Verse json files. + + See https://github.com/versotym/corpusCzechVerse + Each verse (line) is stored as one tree (although it is quite often not a whole sentence). + Start of each stanza is marked with `newpar`. + Start of each poem is marked with `newdoc = [poem_id]`. + + Args: + tokenize: create nodes + """ + def __init__(self, tokenize=True, **kwargs): + self.tokenize = tokenize + self._cache = None + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. 
+ """ + return False + + def read_tree(self): + if self._cache: + return self._cache.pop() + else: + trees = self.read_trees() + if not trees: + return None + self._cache = list(reversed(trees[1:])) + return trees[0] + + def read_trees(self): + if self.filehandle is None: + return None + poems = json.load(self.filehandle) + all_trees = [] + for poem in poems: + poem_trees = [] + for stanza in poem["body"]: + stanza_trees = [] + for line in stanza: + root = Root() + root.text = line["text"] + root.json["rhyme"] = line["rhyme"] + root.json["metre"] = line["metre"] + root.json["stress"] = line["stress"] + stanza_trees.append(root) + if self.tokenize: + words = [[]] + [[w] for w in line["words"]] + for index, puncts in line["punct"].items(): + for punct in puncts: + words[int(index)].append({"token": punct, "lemma": punct}) + for word in words: + for w in word: + node = root.create_child(form=w["token"], lemma=w["lemma"]) + if "morph" in w: + node.xpos = w["morph"] + node.misc["xsampa"] = w["xsampa"] + node.misc["phoebe"] = w["phoebe"] + SetSpaceAfterFromText.process_tree(None, root) + stanza_trees[0].newpar = True + poem_trees.extend(stanza_trees) + root = poem_trees[0] + root.newdoc = poem["poem_id"] + root.json["p_author"] = poem["p_author"] + root.json["b_author"] = poem["b_author"] + root.json["biblio"] = poem["biblio"] + all_trees.extend(poem_trees) + return all_trees diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py index f64cd9ff..d0aef1ee 100644 --- a/udapi/block/read/conll.py +++ b/udapi/block/read/conll.py @@ -79,22 +79,24 @@ def parse_node_line(self, line, root, nodes, parents, mwts): # but it allows for arbitrary columns node = root.create_child() for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] if attribute_name == 'head': try: - parents.append(int(fields[n_attribute])) + parents.append(int(value)) except ValueError as exception: - if not self.strict and fields[n_attribute] == '_': + if not self.strict and value == '_': if self.empty_parent == 'warn': logging.warning("Empty parent/head index in '%s'", line) parents.append(0) else: raise exception elif attribute_name == 'ord': - setattr(node, 'ord', int(fields[n_attribute])) + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") elif attribute_name == 'deps': - setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_' and fields[n_attribute] != '_': - setattr(node, attribute_name, fields[n_attribute]) + setattr(node, 'raw_deps', value) + elif attribute_name != '_' and value != '_': + setattr(node, attribute_name, value) nodes.append(node) @@ -134,11 +136,10 @@ def read_tree_from_lines(self, lines): if node is parent: if self.fix_cycles: logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - node._parent = root - root._children.append(node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py new file mode 100644 index 00000000..f4b73dc8 --- /dev/null +++ b/udapi/block/read/conll2012.py @@ -0,0 +1,143 @@ +""""Conll2012 is a reader block for the coreference in CoNLL-2012 format. + +This implementation was tested on the LitBank files only +(and quickly on Portuguese Corref-PT and Summ-it++v2), so far. 
+LitBank does not use most of the columns, so the implementation +should be improved to handle other types of CoNLL-2012 files. +""" +import json +import logging +import re + +import udapi.block.read.conllu +from udapi.core.root import Root +from udapi.core.node import Node + +RE_BEGIN = re.compile(r'^#begin document ([^ ]+)') + +class Conll2012(udapi.block.read.conllu.Conllu): + """A reader of the Conll2012 files.""" + + def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs): + """Create the Conll2012 reader object. + + Args: + attributes: comma-separated list of column names in the input files + (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref' suitable for LitBank) + For ignoring a column, use "_" as its name. + Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based) + word-order number/index (usualy called ID). + For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'. + For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'. + """ + super().__init__(**kwargs) + self.node_attributes = attributes.split(',') + self._docname = 'd' + + def parse_comment_line(self, line, root): + if line.startswith("#end document"): + return + match = RE_BEGIN.match(line) + if match: + docname = match.group(1) + # LitBank uses e.g. + # #begin document (1023_bleak_house_brat); part 0 + if docname.startswith('(') and docname.endswith(');'): + docname = docname[1:-2] + # Summ-it++v2 uses e.g. + # #begin document /home/andre/Recursos-fontes/Summit/Summ-it_v3.0/corpusAnotado_CCR/CIENCIA_2002_22010/CIENCIA_2002_22010.txt + elif docname.startswith('/home/'): + docname = docname.split('/')[-1] + # Corref-PT-SemEval uses e.g. + # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml + docname = docname.replace('.txt', '').replace('.xml', '') + + root.newdoc = docname + self._global_entity = 'eid-etype-head-other' + root.comment += '$GLOBAL.ENTITY\n' + self._docname = docname + else: + logging.warning(f"Unexpected comment line: {line}") + + def parse_node_line(self, line, root, nodes): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'docname': + if value != self._docname: + logging.warning(f"Document name mismatch {value} != {self._docname}") + + # convert the zero-based index to one-based + # but Corref-PT uses a mix of one-based and zero-based + elif attribute_name == 'ord': + #setattr(node, 'ord', int(value) + 1) + if node.ord not in(int(value) + 1, int(value)): + logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}") + + elif attribute_name == 'coref': + if value and value != '_': + # LitBank always separates chunks by a vertical bar, e.g. (13)|10) + # Summ-it++v2 does not, e.g. 
(13)10) + if '|' in value: + chunks = value.split("|") + else: + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', value) if x] + modified_entities = [] + escaped_docname = self._docname.replace("-", "") + for entity in chunks: + entity_num = entity.replace("(", "").replace(")","") + modified_entity = f"{escaped_docname}_e{entity_num}--1" + if entity.startswith("(") and entity.endswith(")"): + modified_entity = "(" + modified_entity + ")" + elif entity.startswith("("): + modified_entity = "(" + modified_entity + elif entity.endswith(")"): + modified_entity = f"{escaped_docname}_e{entity_num}" + ")" + + # to avoid parentheses clashes, put the entities with ")" first + if modified_entity.startswith("("): + modified_entities.append(modified_entity) + else: + modified_entities.insert(0, modified_entity) + node.misc['Entity'] = ''.join(modified_entities) + + elif attribute_name == 'form' or (attribute_name != '_' and value != '_'): + setattr(node, attribute_name, value) + nodes.append(node) + + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + for line in lines: + if line == '': + pass + elif line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). + if len(nodes) == 1: + return None + + return root + + def read_trees(self): + if self.max_docs: + raise NotImplementedError("TODO implement max_docs in read.Conll2012") + # Corref-PT does not put an empty line before #end document, + # so we need to split both on #end document and empty lines. + return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+|\n#end document\n', self.filehandle.read()) if s] + + def read_tree(self): + raise NotImplementedError("TODO implement read_tree in read.Conll2012") diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 71886752..b485c17d 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -12,8 +12,9 @@ # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released). 
RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') RE_TEXT = re.compile(r'^# text\s*=\s*(.*)') -RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?') +RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?$') RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') +RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)') class Conllu(BaseReader): @@ -33,8 +34,7 @@ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs self.empty_parent = empty_parent self.fix_cycles = fix_cycles - @staticmethod - def parse_comment_line(line, root): + def parse_comment_line(self, line, root): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" sent_id_match = RE_SENT_ID.match(line) if sent_id_match is not None: @@ -68,11 +68,45 @@ def parse_comment_line(line, root): container = root.json['__doc__'] container[json_match.group(2)] = json.loads(json_match.group(3)) return + + entity_match = RE_GLOBAL_ENTITY.match(line) + if entity_match is not None: + global_entity = entity_match.group(1) + if self._global_entity and self._global_entity != global_entity: + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") + self._global_entity = global_entity + root.comment += '$GLOBAL.ENTITY\n' + return + root.comment += line[1:] + "\n" def read_trees(self): - return [self.read_tree_from_lines(s.split('\n')) for s in - self.filehandle.read().split('\n\n') if s] + if not self.max_docs: + # Valid CoNLL-U files must have sentences separated by a single empty line. + # However, some users have to work with invalid files e.g. ending with two empty lines. + # It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow + # as s.split('\n\n') and this time is negligble + # relative to the main CoNLL-U parsing in read_tree_from_lines(). + return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+', self.filehandle.read()) if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. + trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + if tree: + trees.append(tree) + else: + lines.append(line) + return trees def read_tree(self): if self.filehandle is None: @@ -167,7 +201,7 @@ def read_tree_from_lines(self, lines): root._children.append(node) else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: @@ -183,7 +217,11 @@ def read_tree_from_lines(self, lines): # Create multi-word tokens. for fields in mwts: - range_start, range_end = fields[0].split('-') + try: + range_start, range_end = fields[0].split('-') + except ValueError: + logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}") + raise words = nodes[int(range_start):int(range_end) + 1] root.create_multiword_token(words, form=fields[1], misc=fields[-1]) diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py new file mode 100644 index 00000000..16d83d07 --- /dev/null +++ b/udapi/block/read/conllup.py @@ -0,0 +1,107 @@ +"""Conllup is a reader block for the CoNLL-UPlus format. 
+ +Columns which don't have standardize attributes in Udapi/CoNLL-U +are stored in MISC (as key=value pairs). + +This code has been only tested on Hungarian KorKor files for CorefUD so far. +However, in the end, it is not used there (xtsv files are used instead conllup). +""" +import logging +import re + +import udapi.block.read.conll +from udapi.core.root import Root +from udapi.core.node import Node + +RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)') +COLUMN_MAP = { + 'ID': 'ord', +} +NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split() + +class Conllup(udapi.block.read.conll.Conll): + """A reader of the CoNLL-UPlus files.""" + + def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs): + """Create the Conllup reader object. + + Args: + attributes: comma-separated list of column names in the input files + (can be used if the global.columns header is missing or needs to be overriden). + Default='autodetect' which means the column names will be loaded from the global.columns header. + For ignoring a column, use "_" as its name. + save_global_columns: keep the "global.columns" header in root.comments. Default=False. + Note that when saving the output to CoNLL-U, the comment is not needed + and it may be even misleading. It could be helpful only once write.Conllup is implemented + (with the possibility to use the same columns as in the input file). + """ + super().__init__(**kwargs) + self.save_global_columns = save_global_columns + if attributes == 'autodetect': + self.node_attributes = None + else: + self.node_attributes = attributes.split(',') + + def parse_comment_line(self, line, root): + if self.node_attributes is None: + global_columns_match = RE_GLOBAL_COLUMNS.match(line) + if global_columns_match is None: + return super().parse_comment_line(line, root) + global_columns = global_columns_match.group(1) + self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")] + if self.save_global_columns: + root.comment += line[1:] + '\n' + return + return super().parse_comment_line(line, root) + + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + raise NotImplementedError("Empty nodes in CoNLL-UPlus not implemented yet in read.Conllup") + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + nonstandard_attrs = [] + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + if value == '???': + value = 0 + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif value == '_' and attribute_name != 'form': + pass + elif attribute_name == '_': + pass + elif attribute_name in NORMAL_ATTRS: + setattr(node, attribute_name, value) + else: + nonstandard_attrs.append([attribute_name, value]) + + # This needs to be done after node.misc is created (if "misc" in node.attributes) + for attribute_name, value in nonstandard_attrs: + node.misc[attribute_name.capitalize()] = value + + nodes.append(node) diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py new file mode 100644 index 00000000..73e05f3b --- /dev/null +++ b/udapi/block/read/oldcorefud.py @@ -0,0 +1,119 @@ +"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.read.conllu +from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks + +class OldCorefUD(udapi.block.read.conllu.Conllu): + + def __init__(self, replace_hyphen_in_id_with='', **kwargs): + """Create the read.OldCorefUD reader object. + + Args: + replace_hyphen_in_id_with: string to use as a replacement for hyphens in ClusterId + The new format does not allow hyphens in eid (IDs of entities), + so we need to replace them.
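+
+        A usage sketch for converting old-style files to the current format might look like
+        this (the file names are only illustrative, not taken from the source):
+
+            udapy read.OldCorefUD files=old_style.conllu write.Conllu > new_style.conllu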
+ """ + super().__init__(**kwargs) + self.replace_hyphen_in_id_with = replace_hyphen_in_id_with + self.orig2new = {} + self.new2orig = {} + + def _fix_id(self, cid): + if not cid or '-' not in cid: + return cid + new_cid = self.orig2new.get(cid) + if new_cid is None: + new_cid = cid.replace('-', self.replace_hyphen_in_id_with) + base, counter = new_cid, 1 + while new_cid in self.new2orig: + counter += 1 + new_cid = f"{base}{counter}" + self.new2orig[new_cid] = cid + self.orig2new[cid] = new_cid + return new_cid + + def process_document(self, doc, strict=True): + super().process_document(doc) + + eid_to_entity = {} + for node in doc.nodes_and_empty: + index, index_str = 0, "" + eid = node.misc["ClusterId"] + if not eid: + index, index_str = 1, "[1]" + eid = node.misc["ClusterId[1]"] + eid = self._fix_id(eid) + while eid: + entity = eid_to_entity.get(eid) + if entity is None: + entity = CorefEntity(eid) + eid_to_entity[eid] = entity + mention = CorefMention(words=[node], entity=entity) + if node.misc["MentionSpan" + index_str]: + mention.span = node.misc["MentionSpan" + index_str] + etype = node.misc["ClusterType" + index_str] + if etype: + if entity.etype is not None and etype != entity.etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + entity.etype = etype + + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention) + for link_str in bridging_str.split(','): + target, relation = link_str.split(':') + target = self._fix_id(target) + if target == eid: + _error("Bridging cannot self-reference the same entity: " + target, strict) + if target not in eid_to_entity: + eid_to_entity[target] = CorefEntity(target) + mention._bridging.append((eid_to_entity[target], relation)) + + split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. + # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. + for ante_str in split_ante_str.replace('+', ',').split(','): + ante_str = self._fix_id(ante_str) + if ante_str in eid_to_entity: + if ante_str == eid: + _error("SplitAnte cannot self-reference the same entity: " + eid, strict) + split_antes.append(eid_to_entity[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + ante_cl = CorefEntity(ante_str) + eid_to_entity[ante_str] = ante_cl + split_antes.append(ante_cl) + entity.split_ante = sorted(split_antes) + + # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas. + # We also need to escape forbidden characters. + mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',') + mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') + index += 1 + index_str = f"[{index}]" + eid = self._fix_id(node.misc["ClusterId" + index_str]) + # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__), + # not by the keys (eid). + # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order. 
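+        # The loop below checks that every entity referenced only via SplitAnte or Bridging
+        # was eventually defined by a ClusterId, sorts the mentions within each entity,
+        # and then rebuilds doc._eid_to_entity in the sorted-entity order described above.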
+ for entity in eid_to_entity.values(): + if not entity._mentions: + _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + entity._mentions.sort() + doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())} + + # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). + attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + + +def _error(msg, strict): + if strict: + raise ValueError(msg) + logging.error(msg) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 356e196f..7487d580 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -9,6 +9,8 @@ class Sentences(BaseReader): Args: ignore_empty_lines: if True, delete empty lines from the input. Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. Default=False. rstrip: a set of characters to be stripped from the end of each line. Default='\r\n '. You can use rstrip='\n' if you want to preserve any space or '\r' (Carriage Return) at end of line, @@ -16,8 +18,12 @@ class Sentences(BaseReader): As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. """ - def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line self.rstrip = rstrip super().__init__(**kwargs) @@ -38,11 +44,20 @@ def read_tree(self, document=None): # (or '\r\n' if reading a Windows file on Unix machine). if line == '': return None - if self.ignore_empty_lines: + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: while line in {'\n', '\r\n'}: + preceded_by_empty_line = True line = self.filehandle.readline() if line == '': return None root = Root() root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root + + # The first line in a file also marks a start of new document + def after_process_document(self, document): + if self.newdoc_if_empty_line: + document.bundles[0].trees[0].newdoc = True diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py new file mode 100644 index 00000000..0213bdcb --- /dev/null +++ b/udapi/block/read/text.py @@ -0,0 +1,57 @@ +"""Text class is a reader for word-wrapped plain-text files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root + + +class Text(BaseReader): + r"""A reader for plain-text files with sentences on one or more lines. + + Sentences are separated by one or more empty lines. + Newlines within sentences are substituted by a space. + + Args: + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base` keeps these characters in `SpacesAfter`. 
+ As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. + """ + def __init__(self, rstrip='\r\n ', **kwargs): + self.rstrip = rstrip + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def read_tree(self, document=None): + if self.filehandle is None: + return None + lines = [] + line = None + while True: + line = self.filehandle.readline() + # if readline() returns an empty string, the end of the file has been + # reached, while a blank line is represented by '\n' + # (or '\r\n' if reading a Windows file on Unix machine). + if line == '': + if not lines: + return None + else: + break + elif line in {'\n', '\r\n'}: + if not lines: + continue + else: + break + else: + lines.append(line.rstrip(self.rstrip)) + + root = Root() + root.text = " ".join(lines) + return root diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py index 5f4a8423..58be9b6d 100644 --- a/udapi/block/segment/simple.py +++ b/udapi/block/segment/simple.py @@ -33,8 +33,12 @@ def is_boundary(self, first, second): return False if first[-1] in '"“»›)': first = first[:-1] + if not first: + return False if second[0] in '"„«¿¡‹(': second = second[1:] + if not second: + return False if not second[0].isupper() or second[0].isdigit(): return False if not first[-1] in '.!?': diff --git a/udapi/block/transform/flatten.py b/udapi/block/transform/flatten.py index ded64fb1..d218ad27 100644 --- a/udapi/block/transform/flatten.py +++ b/udapi/block/transform/flatten.py @@ -4,6 +4,22 @@ class Flatten(Block): """Apply `node.parent = node.root; node.deprel = 'root'` on all nodes.""" - def process_node(self, node): - node.parent = node.root - node.deprel = 'root' + def __init__(self, oneroot=False, **kwargs): + """Args: + oneroot: only the first node will have deprel 'root'. + All other nodes will depend on the first node with deprel 'dep'. + This option makes the trees valid according to the validator. + (default=False) + """ + super().__init__(**kwargs) + self.oneroot = oneroot + + def process_tree(self, tree): + for node in tree.descendants: + node.parent = node.root + node.deprel = 'root' + if self.oneroot: + first = tree.descendants[0] + for node in tree.descendants[1:]: + node.parent = first + node.deprel = 'dep' diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index ffa78bbb..996f4dc9 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -1,5 +1,6 @@ """Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" from udapi.core.block import Block +import logging class AddMwt(Block): @@ -14,6 +15,9 @@ def process_node(self, node): orig_attr[attr] = getattr(node, attr) orig_attr['feats'] = node.feats.copy() orig_attr['misc'] = node.misc.copy() + # Defaults for the newly created MWT + mwt_misc = node.misc.copy() + mwt_form = node.form forms = analysis['form'].split() main = analysis.get('main', 0) @@ -36,12 +40,28 @@ def process_node(self, node): elif orig_attr['form'][0].isupper(): nodes[0].form = nodes[0].form.title() + node.misc = None for attr in 'lemma upos xpos feats deprel misc'.split(): if attr in analysis: values = analysis[attr].split() for i, new_node in enumerate(nodes): + if len(values) <= i: + logging.warning("Attribute '%s' not supplied for word no. 
%d" % (attr, i)) + for attr in 'form lemma upos xpos feats deprel misc'.split(): + logging.warning("%s = %s" % (attr, analysis.get(attr, ''))) if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) + # No MISC attribute should be duplicated on the word level and token level, + # so if copying MISC to a new_node, delete mwt_misc. + # However, SpaceAfter should be annotated only on the token level, + # so make sure it is not accidentally copied on the word level. + if attr == 'misc': + orig_attr['misc'].clear() + for a in 'SpaceAfter SpacesAfter SpacesBefore'.split(): + if new_node.misc[a]: + orig_attr['misc'][a] = new_node.misc[a] + del new_node.misc[a] + elif attr == 'feats' and '*' in values[i]: new_node.feats = values[i] for feat_name, feat_value in list(new_node.feats.items()): @@ -50,8 +70,23 @@ def process_node(self, node): else: setattr(new_node, attr, values[i]) - mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc']) - node.misc = None + # Entity (coreference) annotation should be only on the word level, + # so make sure it does not stay on the token level. + if mwt_misc['Entity']: + nodes[0].misc['Entity'] = mwt_misc['Entity'] + del mwt_misc['Entity'] + + # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT. + if node.multiword_token: + mwt_words = node.multiword_token.words + mwt_form = node.multiword_token.form + if node.multiword_token.misc: + mwt_misc.update(node.multiword_token.misc) + node.multiword_token.remove() + mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes + nodes = mwt_words + + mwt = node.root.create_multiword_token(nodes, mwt_form, mwt_misc) self.postprocess_mwt(mwt) def multiword_analysis(self, node): diff --git a/udapi/block/ud/addpuncttype.py b/udapi/block/ud/addpuncttype.py new file mode 100644 index 00000000..f5f20e06 --- /dev/null +++ b/udapi/block/ud/addpuncttype.py @@ -0,0 +1,91 @@ +""" +Some UD treebanks use features PunctType and PunctSide that classify +punctuation symbols. This block can be used to add such features to data where +they are missing – the classification is mostly deterministic. If the input +data already contains such features, their values will be overwritten. +""" +from udapi.core.block import Block + +# TODO We need to know the language, there are many other quotation styles, +# e.g. Finnish and Swedish uses the same symbol for opening and closing: ”X”. +# Danish uses uses the French quotes, but switched: »X«. 
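+# A usage sketch (the file names are illustrative, assuming the standard udapy CLI):
+#   udapy read.Conllu files=in.conllu ud.AddPunctType write.Conllu > out.conllu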
+ +PUNCT_TYPES = { + '(': 'Brck', + ')': 'Brck', + '[': 'Brck', + ']': 'Brck', + '{': 'Brck', + '}': 'Brck', + '.': 'Peri', + '...': 'Elip', + '…': 'Elip', + ',': 'Comm', + ';': 'Semi', + ':': 'Colo', + '!': 'Excl', + '¡': 'Excl', # Spanish initial exclamation mark + '?': 'Qest', + '¿': 'Qest', # Spanish initial question mark + '/': 'Colo', # it is used this way in AnCora + '-': 'Dash', + '–': 'Dash', + '—': 'Dash', + '"': 'Quot', + "'": 'Quot', + '`': 'Quot', + '“': 'Quot', # opening English, closing Czech + '”': 'Quot', # closing English + '„': 'Quot', # opening Czech + '‘': 'Quot', # opening English, closing Czech + '’': 'Quot', # closing English + '‚': 'Quot', # opening Czech + '«': 'Quot', # opening French, closing Danish + '»': 'Quot', # closing French, opening Danish + '‹': 'Quot', + '›': 'Quot', + '《': 'Quot', # Korean, Chinese + '》': 'Quot', + '「': 'Quot', # Chinese, Japanese + '」': 'Quot', + '『': 'Quot', + '』': 'Quot' +} + +PUNCT_SIDES = { + '(': 'Ini', + ')': 'Fin', + '[': 'Ini', + ']': 'Fin', + '{': 'Ini', + '}': 'Fin', + '¡': 'Ini', # Spanish initial exclamation mark + '!': 'Fin', # but outside Spanish people may expect empty value + '¿': 'Ini', # Spanish initial question mark + '?': 'Fin', + '《': 'Ini', # Korean, Chinese + '》': 'Fin', + '「': 'Ini', # Chinese, Japanese + '」': 'Fin', + '『': 'Ini', + '』': 'Fin' +} + + +class AddPunctType(Block): + """Add features PunctType and PunctSide where applicable.""" + + def process_node(self, node): + # The two features apply only to PUNCT. If they already occur elsewhere, erase them. + if node.upos != 'PUNCT': + node.feats['PunctType'] = '' + node.feats['PunctSide'] = '' + else: + if node.form in PUNCT_TYPES: + node.feats['PunctType'] = PUNCT_TYPES[node.form] + else: + node.feats['PunctType'] = '' + if node.form in PUNCT_SIDES: + node.feats['PunctSide'] = PUNCT_SIDES[node.form] + else: + node.feats['PunctSide'] = '' diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py new file mode 100644 index 00000000..99db7fa2 --- /dev/null +++ b/udapi/block/ud/ar/fixedeprels.py @@ -0,0 +1,604 @@ +"""Block to fix case-enhanced dependency relations in Arabic.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'أَنَّ': [], + 'أَن': [], + 'إِنَّ': [], + 'إِذَا': [], + 'لَو': [], + 'حَيثُ': [], + 'مِثلَ': [], + 'لِأَنَّ': [], + 'كَمَا': [], +# 'فِي_حِينَ': [], + 'فَ': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
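+    # For example (an illustrative case, not taken from the data): an enhanced relation such
+    # as "obl:عَن:acc" would be rewritten by process_node() to "obl:عَن:gen", because the
+    # mapping below lists عَن only with the genitive.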
+ unambiguous = { + 'اِبتِدَاء_مِن': 'مِن:gen', + 'إِثرَ': 'إِثرَ:gen', # ʾiṯra = right after + 'أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'إِذ': 'إِذ', # ʾiḏ = because + 'إِذ_أَنَّ': 'إِذ', # ʾiḏ ʾanna + 'إِذًا': 'إِذَا', + 'إِذَا': 'إِذَا', # remove morphological case; ʾiḏā = if + 'إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'أَلَّا': 'إِلَّا', + 'إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_إِذَا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَن': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ_هُوَ': 'إِلَّا', # ʾillā = except, unless + 'إِلَى': 'إِلَى:gen', # ʾilā = to + 'إِلَى_أَن': 'إِلَى:gen', + 'إِلَى_أَنَّ': 'إِلَى_أَنَّ', # until? that? + 'إِلَى_أَنَّ_لَدَى': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن_بَينَ': 'إِلَى_أَنَّ', + 'إِلَى_بَعدَ': 'إِلَى:gen', + 'إِلَى_بَينَ': 'إِلَى_بَينِ:gen', # ʾilā bayni = to between + 'إِلَى_جَانِب': 'إِلَى_جَانِبِ:gen', # ʾilā ǧānibi = beside + 'إِلَى_حَوَالَى': 'إِلَى:gen', # ila hawala = to around X + 'إِلَى_حَوَالَى_مِن': 'إِلَى:gen', # ila hawala min + 'إِلَى_حَيثُ': 'إِلَى:gen', + 'إِلَى_حِينَ': 'فِي_حِينِ', # during + 'إِلَى_خَارِجَ': 'إِلَى_خَارِجِ:gen', # ʾilā ḫāriǧi = out + 'إِلَى_فِي': 'إِلَى:gen', + 'إِلَى_قَبلَ': 'إِلَى_قَبلِ:gen', # ʾilā qabli = until before X (e.g. until one year ago) + 'إِلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'إِلَى_نَحوَ': 'إِلَى:gen', # to about N + 'أَمَّا': 'أَمَامَ:gen', + 'إِمَّا_لِ': 'لِ:gen', + 'أَمَامَ': 'أَمَامَ:gen', # ʾamāma = in front of + 'أَمَامَ_مِن': 'أَمَامَ:gen', + 'أَن': 'أَنَّ', # remove morphological case; ʾanna = that + 'أَنَّ': 'أَنَّ', # remove morphological case; ʾanna = that + 'إِن': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّ': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّمَا': 'إِنَّ', + 'إِيَّا': 'إِلَّا', + 'بِ': 'بِ:gen', # bi = for, with + 'بِ_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'بِ_إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'بِ_اِستِثنَاء': 'بِاِستِثنَاءِ:gen', # biistiṯnāʾi = with exception of + 'بِ_اِسم': 'بِاِسمِ:gen', # biismi = in name of + 'بِ_إِضَافَة_إِلَى': 'بِاَلإِضَافَةِ_إِلَى:gen', # bi-al-ʾiḍāfati ʾilā = in addition to + 'بِ_إِضَافَة_إِلَى_أَنَّ': 'إِلَى_أَنَّ', + 'بِ_إِضَافَة_لِ': 'بِاَلإِضَافَةِ_إِلَى:gen', # in addition to + 'بِ_اِعتِبَار': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_أَنَّ': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_مِن': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِمَاد_عَلَى': 'بِاَلِاعتِمَادِ_عَلَى:gen', # bi-al-i-ʼʿtimādi ʿalā = depending on + 'بِ_إِلَى': 'بِ:gen', + 'بِ_أَنَّ': 'أَنَّ', # that + 'بِ_أَن': 'بِ:gen', + 'بِ_إِنَّ': 'بِ:gen', + 'بِ_أَنَّ_أَمَامَ': 'أَنَّ', # that + 'بِ_أَنَّ_لَا': 'أَنَّ', # that + 'بِ_أَنَّ_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هما_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ_عَلَى': 'أَنَّ', # that + 'بِ_اِنطِلَاق': 'بِ:gen', + 'بِ_تَالِي_إِنَّ': 'بِ:gen', + 'بِ_تَعَاوُن_مَعَ': 'بِاَلتَّعَاوُنِ_مَعَ:gen', # bi-at-taʿāwuni maʿa = in cooperation with + 'بِ_تُهمَة': 'بِتُهمَةِ:gen', # bituhmati = on charges of + 'بِ_تَوَازِي_مَعَ': 'بِاَلتَّوَازِي_مَعَ:gen', # bi-at-tawāzī maʿa = in parallel with + 'بِ_ثُمَّ': 'بِ:gen', + 'بِ_جَانِب': 'بِجَانِبِ:gen', # biǧānibi = next to + 'بِ_جِهَة': 'بِ:gen', + 'بِ_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 
'بِ_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'بِ_حُضُور': 'فِي_حُضُورِ:gen', # together with + 'بِ_حَقّ': 'بِ:gen', + 'بِ_حُكم': 'بِ:gen', + 'بِ_حُلُول': 'بِ:gen', + 'بِ_حَوَالَى': 'بِ:gen', # bi hawala = with around X + 'بِ_حَيثُ': 'بِ:gen', + 'بِ_خُصُوص': 'بِخُصُوصِ:gen', # biḫuṣūṣi = with regard + 'بِ_خِلَاف': 'بِخِلَافِ:gen', # biḫilāfi = in addition to + 'بِ_دَاخِلَ': 'دَاخِلَ:gen', + 'بِ_دَعوَى': 'بِ:gen', + 'بِ_دَور': 'بِ:gen', # bidawri = with role, in turn? + 'بِ_دُون': 'دُونَ:gen', + 'بِ_دُونَ': 'دُونَ:gen', # bi dūni = without + 'بِ_دُونَ_أَن': 'دُونَ:gen', # bi dūni ʾan = without + 'بِ_رِعَايَة': 'بِ:gen', + 'بِ_رَغم': 'رَغمَ:gen', # despite + 'بِ_رَغم_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَن': 'بِ:gen', + 'بِ_رَغم_مِن_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَنَّ_هُوَ': 'بِ:gen', + 'بِ_رِفقَة': 'بِرِفقَةٍ:gen', # birifqatin = in company of + 'بِ_رِئَاسَة': 'بِ:gen', + 'بِ_سَبّ': 'بِ:gen', + 'بِ_سَبَب': 'بِسَبَبِ:gen', # bisababi = because of + 'بِ_شَأن': 'بِشَأنِ:gen', # bišaʾni = about, regarding (lit. with + matter) + 'بِ_شَرط_أَن': 'بِ:gen', + 'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to + 'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of + 'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as + 'بِ_عَكس': 'بِ:gen', + 'بِ_عَلَى': 'بِ:gen', + 'بِ_عَن': 'بِ:gen', + 'بِ_عَين': 'بِ:gen', + 'بِ_غَضّ_نَظَر_عَن': 'بِغَضِّ_اَلنَّظَرِ_عَن:gen', # biġaḍḍi an-naẓari ʿan = regardless of + 'بِ_فَضل': 'بِفَضلِ:gen', # bifaḍli = thanks to + 'بِ_فِي': 'بِ:gen', + 'بِ_قَدر': 'بِ:gen', + 'بِ_قُرب_مِن': 'بِاَلقُربِ_مِن:gen', # bi-al-qurbi min = near (with proximity to) + 'بِ_قَصد': 'بِقَصدِ:gen', # biqaṣdi = with intention + 'بِ_كَ': 'بِ:gen', + 'بِ_لِ': 'بِ:gen', + 'بِ_لَا': 'بِ:gen', + 'بِ_مَا_أَنَّ': 'بِ:gen', + 'بِ_مَثَابَة': 'بِ:gen', + 'بِ_مِثلَ': 'مِثلَ', # miṯla = like + 'بِ_مُجَرَّد': 'بِ:gen', + 'بِ_مُسَاعَدَة': 'بِ:gen', + 'بِ_مُشَارَكَة': 'بِمُشَارَكَةِ:gen', # bimušārakati = with participation of + 'بِ_مُقَارَنَة_بِ': 'بِاَلمُقَارَنَةِ_بِ:gen', # bi-al-muqāranati bi = in comparison to + 'بِ_مُقتَضَى': 'بِمُقتَضَى:gen', # bimuqtaḍā = with requirement of + 'بِ_مِقدَار': 'بِ:gen', + 'بِ_مِن': 'بِ:gen', + 'بِ_مُنَاسَبَة': 'بِمُنَاسَبَةِ:gen', # bimunāsabati = on the occasion of + 'بِ_مُوجِب': 'بِمُوجِبِ:gen', # bimūǧibi = with motive + 'بِ_نَتِيجَة': 'بِ:gen', + 'بِ_نَحوَ': 'بِ:gen', # by about N + 'بِ_نِسبَة': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati (bin-nisbati) = in proportion/relation to + 'بِ_نِسبَة_إِلَى': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati ʾilā (bin-nisbati ʾilā) = in proportion/relation to + 'بِ_نِسبَة_لِ': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نِسبَة_لِ_مِن': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نَظَر_إِلَى': 'بِ:gen', + 'بِ_نِيَابَة_عَن': 'بِاَلنِّيَابَةِ_عَن:gen', # bi-an-niyābati ʿan = on behalf of + 'بِ_هَدَف': 'بِهَدَفِ:gen', # bihadafi = with goal + 'بِ_وَ_لِ': 'بِ:gen', + 'بِ_وَاسِطَة': 'بِوَاسِطَةِ:gen', # biwāsiṭati = by means of + 'بِ_وَاقِع': 'بِ:gen', + 'بِ_وَسَط': 'بِوَسَطِ:gen', # biwasaṭi = in the middle of + 'بِ_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'بِ_وَصف': 'بِ:gen', + 'بازاء': 'بِ:gen', + 'بالتسخين': 'بِ:gen', + 'بَدَلًا_مِن': 'بَدَلًا_مِن:gen', # badalan min = instead of + 'بدون': 'دُونَ:gen', # without + 'بشان': 'بِشَأنِ:gen', + 'بَعدَ': 'بَعدَ:gen', # baʿda = after + 'بَعدَ_أَن': 'بَعدَ:gen', # baʿda ʾan = 
after + clause + 'بَعدَ_حَوَالَى': 'بَعدَ:gen', # baada hawala + 'بَعدَ_نَحوَ': 'بَعدَ:gen', # after about N + 'بَعدَمَا': 'بَعدَ:gen', # baʿdamā = after + 'بُعَيدَ': 'بُعَيدَ:gen', # buʿayda = shortly after + 'بَل': 'قَبلَ:gen', + 'بِنَاء_عَلَى': 'بناء_عَلَى:gen', + 'بناء_عَلَى': 'بناء_عَلَى:gen', # bnāʾ ʿalā = based on + 'بناء_لِ': 'لِ:gen', + 'بَيدَ': 'بِ:gen', + 'بَيدَ_أَنَّ': 'بِ:gen', + 'بَينَ': 'بَينَ:gen', # bayna = between + 'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala + 'بينا': 'بَينَ:gen', # bayna = between + 'بَينَمَا': 'بَينَ:gen', + 'بَينَمَا_لَم': 'بَينَ:gen', + 'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing + 'تَحتَ': 'تَحتَ:gen', # tahta = under + 'ثَمَّ': 'بِ:gen', + 'ثُمَّ': 'بِ:gen', + 'جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'حَتَّى': 'حَتَّى:gen', # ḥattā = until + 'حَتَّى_أَنَّ': 'حَتَّى:gen', # before + 'حَتَّى_إِنَّ': 'حَتَّى:gen', # before + 'حَتَّى_بِ': 'حَتَّى:gen', # before + 'حَتَّى_لَو': 'لَو', # even if + 'حَتَّى_وَ_لَو': 'لَو', # even if + 'حَتَّى_وإن': 'إِنَّ', + 'حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَسَبَمَا': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَوَالَى': 'حَوَالَى', # ḥawālā = around, about + 'حَوَالَى_مِن': 'مِن:gen', # hawala min = from around X + 'حَولَ': 'حَولَ:gen', # ḥawla = about + 'حولما_إِذَا': 'إِذَا', + 'حِيَالَ': 'حِيَالَ:gen', # ḥiyāla = concerning + 'حَيثُ': 'حَيثُ', # remove morphological case; ḥayṯu = where (SCONJ, not ADV) + 'حِينَمَا': 'فِي_حِينِ', # during + 'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during + 'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind + 'دَاخِل': 'دَاخِلَ:gen', # dāḫila = inside of + 'دَاخِلَ': 'دَاخِلَ:gen', # dāḫila = inside of + 'دُونَ': 'دُونَ:gen', # dūna = without + 'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without + 'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without + 'دونما': 'دُونَ:gen', + 'ذٰلِكَ_بَعدَمَا': 'بَعدَ:gen', + 'ذٰلِكَ_عِندَمَا': 'بِ:gen', + 'ذٰلِكَ_لِأَنَّ': 'لِأَنَّ', # because + 'ذٰلِكَ_لِكَي': 'لِكَي', # li-kay = in order to + 'ذٰلِكَ_نَظَر_لِ': 'بِ:gen', + 'رَغمَ': 'رَغمَ:gen', # raġma = despite + 'رَغمَ_أَنَّ': 'رَغمَ:gen', # raġma ʾanna = despite + clause + 'رَغمَ_أَنَّ_مِن': 'رَغمَ:gen', # raġma ʾanna min = despite + 'رَهنَ': 'رَهنَ:gen', # rahna = depending on + 'رَيثَمَا': 'رَهنَ:gen', # rahna = depending on + 'سِوَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_أَنَّ_هُوَ': 'سِوَى:gen', # siwā = except for + 'سِوَى_بِ': 'سِوَى:gen', # siwā = except for + 'سِوَى_عَلَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_لِ': 'سِوَى:gen', # siwā = except for + 'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against + 'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among + 'طَالَمَا': 'طَالَمَا', # ṭālamā = as long as + 'طالَما': 'طَالَمَا:gen', + 'طَالَمَا_أَنَّ': 'طَالَمَا', # ṭālamā = as long as + 'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout + 'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during + 'عبر': 'عَبرَ:gen', + 'عَبرَ': 'عَبرَ:gen', # ʿabra = via + 'عَدَا': 'عَدَا:gen', # ʿadā = except for + 'عَقِبَ': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_أَن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_مِن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَلَى': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أبواب': 'عَلَى:gen', + 'عَلَى_إِثرَ': 'إِثرَ:gen', # ʿalā ʾiṯri = right after + 'عَلَى_أَثَر': 'عَلَى:gen', + 'عَلَى_اِختِلَاف': 'عَلَى:gen', + 'عَلَى_أَسَاس': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on + 'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on + 'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā 
iʿtibāri ʾanna = considering that + 'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'عَلَى_الفور': 'عَلَى:gen', + 'عَلَى_إِلَى': 'عَلَى:gen', + 'عَلَى_أَن': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن_بِ': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_مِن_شَأن': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ_لَدَى': 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ': 'عَلَى:gen', + 'عَلَى_بِ_فِي': 'عَلَى:gen', + 'عَلَى_بَينَ': 'عَلَى:gen', + 'عَلَى_حَدّ': 'عَلَى:gen', + 'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'عَلَى_حَولَ': 'عَلَى:gen', + 'عَلَى_رَأس': 'عَلَى_رَأسِ:gen', # ʿalā raʾsi = on top of + 'عَلَى_رَغم': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغمَ_أَنَّ': 'رَغمَ:gen', # ʿalā raġma ʾanna = despite + clause + 'عَلَى_رَغم_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ_هُوَ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_طَرِيقَة': 'عَلَى_طَرِيقَةِ:gen', # ʿalā ṭarīqati = on the way + 'عَلَى_عَكس': 'عَلَى:gen', + 'عَلَى_غِرَار': 'عَلَى_غِرَارِ:gen', # ʿalā ġirāri = similar to + 'عَلَى_قَيد': 'عَلَى:gen', + 'عَلَى_لِسَان': 'عَلَى:gen', + 'عَلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'عَلَى_مدى': 'عَلَى:gen', + 'عَلَى_مَدَى': 'عَلَى_مَدَى:gen', # ʿalā madā = on period + 'عَلَى_مَقرَبَة_مِن': 'عَلَى_مَقرَبَةٍ_مِن:gen', # ʿalā maqrabatin min = in the vicinity of + 'عَلَى_مِن': 'عَلَى:gen', + 'عَلَى_نَحوَ': 'عَلَى:gen', # to about N + 'عَلَى_يَد': 'عَلَى:gen', + 'عَن': 'عَن:gen', # ʿan = about, from + 'عَن_أَن': 'عَن:gen', + 'عَن_أَنَّ': 'عَن:gen', + 'عَن_أَنَّ_وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'عَن_بِ': 'عَن:gen', + 'عَن_طَرِيق': 'عَن_طَرِيقِ:gen', # ʿan ṭarīqi = via + 'عَن_فِي_أَن': 'عَن:gen', + 'عَن_قُربَ': 'قُربَ:gen', # qurba = near + 'عَن_مِثلَ': 'مِثلَ', # miṯla = like + 'عَن_مِن': 'عَن:gen', + 'عِندَ': 'عِندَمَا', # ʿinda = when + 'عِندَمَا': 'عِندَمَا', # ʿindamā = when + 'غَيرَ': 'إِلَّا', + 'فَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_إِذَا': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَدَل_مِن_أَن': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَينَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_عَلَى': 'فَ', # fa = so (advcl or coordination) + 'فَ_فِي': 'فَ', # fa = so (advcl or coordination) + 'فَ_مِن': 'فَ', # fa = so (advcl or coordination) + 'فَورَ': 'فَورَ:gen', # fawra = as soon as + 'فَوقَ': 'فَوقَ:gen', # fawqa = above, over + 'فِي': 'فِي:gen', # fī = in + 'فِي_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'فِي_أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'فِي_إِطَار': 'فِي_إِطَار:gen', # fī ʾiṭār = in frame + 'فِي_اعقاب': 'فِي_أَعقَابِ:gen', + 'فِي_إِلَى': 'فِي:gen', + 'فِي_أَن': 'فِي:gen', + 'فِي_أَنَّ': 'فِي:gen', + 'فِي_أَنَّ_عَلَى': 'فِي:gen', + 'فِي_أَنَّ_لَدَى': 'فِي:gen', + 'فِي_أَنَّ_مِن': 'فِي:gen', + 'فِي_بِ': 'فِي:gen', + 'فِي_بِ_فِي': 'فِي:gen', + 'فِي_بَاطِن': 'فِي:gen', + 'فِي_بَعدَ': 'فِي:gen', + 'فِي_بَينَ': 'بَينَ:gen', + 'فِي_حَال': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَدّ': 'فِي:gen', + 'فِي_حُضُور': 'فِي_حُضُورِ:gen', # fī ḥuḍūri = in presence of + 'فِي_حَقّ': 'فِي:gen', + 'فِي_حُكم': 'فِي:gen', + 'فِي_حَوَالَى': 'فِي:gen', # fi hawala = in around X + 'فِي_حِين': 'فِي_حِينِ', # fī ḥīni = while + 
'فِي_حِينَ': 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِين_أَنَّ': 'فِي_حِينِ', + 'فِي_خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'فِي_خِتَام': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِتَامِ': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِلَالَ': 'فِي:gen', + 'فِي_دَاخِل': 'دَاخِل:gen', + 'فِي_دَاخِلَ': 'فِي:gen', + 'فِي_سَبِيل': 'فِي_سَبِيلِ:gen', # fī sabīli = in order to + 'فِي_سِيَاق': 'فِي:gen', + 'فِي_شَأن': 'فِي_شَأنِ:gen', # fī šaʾni = in regard of + 'فِي_شَكل': 'فِي:gen', + 'فِي_صَفّ': 'فِي:gen', + 'فِي_صُورَة': 'فِي:gen', + 'فِي_ضَوء': 'فِي_ضَوءِ:gen', # fī ḍawʾi = in light of + 'فِي_ظِلّ': 'فِي_ظِلِّ:gen', # fī ẓilli = in light of + 'فِي_عُقب': 'فِي_أَعقَابِ:gen', # fī ʾaʿqābi = in the aftermath of + 'فِي_غَضن': 'فِي:gen', + 'فِي_غُضُون': 'فِي:gen', + 'فِي_مَا': 'فِي:gen', + 'فِي_مِثلَ': 'مِثلَ', # miṯla = like + 'فِي_مَجَال': 'فِي_مَجَالِ:gen', # fī maǧāli = in the area of + 'فِي_مستشفى': 'فِي:gen', + 'فِي_مَعَ': 'فِي:gen', + 'فِي_مُقَابِلَ': 'مُقَابِلَ:gen', + 'فِي_مَقدَم': 'فِي:gen', + 'فِي_مِن': 'فِي:gen', + 'فِي_مُنَاسَبَة': 'فِي_مُنَاسَبَةِ:gen', # fī munāsabati = on the occasion of + 'فِي_مُوَاجَهَة': 'فِي:gen', + 'فِي_نَحوَ': 'فِي:gen', # in about N + 'فِي_نِطَاق': 'فِي:gen', + 'فِي_وَجه': 'فِي:gen', + 'فِي_وَسط': 'وَسطَ:gen', + 'فِي_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'فِيمَا': 'فِيمَا', # fīmā = while + 'قُبَالَةَ': 'قُبَالَةَ:gen', # qubālata = in front of, facing + 'قَبلَ': 'قَبلَ:gen', # qabla = before + 'قَبلَ_أَن': 'قَبلَ:gen', # qabla = before + 'قَبلَ_حَوَالَى': 'قَبلَ:gen', # qabla hawala + 'قَبلَ_نَحوَ': 'قَبلَ:gen', # before about N + 'قُبَيلَ': 'قُبَيلَ:gen', # qubayla = before + 'قُربَ': 'قُربَ:gen', # qurba = near + 'قَيدَ': 'فِي:gen', + 'كَ': 'كَ:gen', # ka = in (temporal?) + 'كَ_أَنَّ': 'كَ:gen', + 'كَ_لِ': 'كَ:gen', + 'كَ_وَ_وَ': 'كَ:gen', + 'كَأَنَّمَا': 'كَأَنَّمَا', # ka-ʾannamā = as if + 'كُلَّمَا': 'كُلَّمَا', # kullamā = whenever + 'كَمَا': 'كَمَا', # remove morphological case; kamā = as + 'كَي': 'لِكَي', # kay = in order to + 'لَ': 'لِ:gen', + 'لَ_عَلَّ': 'لِ:gen', + 'لِ': 'لِ:gen', # li = to + 'لِ_أَجَلّ': 'لِ:gen', + 'لِ_إِلَى': 'لِ:gen', + 'لِ_أَمَامَ_وَ': 'لِ:gen', + 'لِ_أَن': 'لِ:gen', + 'لِ_بِ': 'لِ:gen', + 'لِ_جِهَة': 'لِ:gen', + 'لِ_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'لِ_حَوَالَى': 'لِ:gen', # li hawala = for around X + 'لِ_خَارِجَ': 'لِخَارِجِ:gen', # liḫāriǧi = out + 'لِ_دُخُول': 'لِ:gen', + 'لِ_دَرَجَة_أَنَّ': 'لِ:gen', + 'لِ_سَبَب': 'لِ:gen', + 'لِ_صَالِح': 'لِصَالِحِ:gen', # liṣāliḥi = in interest of + 'لِ_عَلَى': 'لِ:gen', + 'لِ_عَن': 'لِ:gen', + 'لِ_عِندَ': 'لِ:gen', + 'لِ_فِي': 'لِ:gen', + 'لِ_فِي_بَينَ': 'لِ:gen', + 'لِ_كَون': 'لِكَونِ', # likawni = because + 'لِ_لِئَلّا': 'لِ:gen', + 'لِ_مِثلَ': 'مِثلَ', # miṯla = like + 'لِ_مَعَ': 'لِ:gen', + 'لِ_مِن': 'لِ:gen', + 'لِ_نَحوَ': 'لِ:gen', # to/for about N + 'لِ_وَ': 'لِ:gen', + 'لِ_وَ_فِي': 'لِ:gen', + 'لَا': 'إِلَّا', + 'لَا_سِيَّمَا_بَعدَ': 'بَعدَ:gen', + 'لَا_سِيَّمَا_وَ_أَنَّ': 'أَنَّ', + 'لَا_سِيَّمَا_وَ_أَنَّ_هُوَ': 'أَنَّ', + 'لِأَنَّ': 'لِأَنَّ', # remove morphological case; li-ʾanna = because + 'لدى': 'لَدَى:gen', + 'لَدَى': 'لَدَى:gen', # ladā = with, by, of, for + 'لِذَا': 'لِذَا', # liḏā = so, therefore + 'لِذَا_فَ': 'لِ:gen', + 'لِذٰلِكَ': 'لِذَا', # liḏā = so, therefore + 'لٰكِنَّ': 'مَعَ:gen', + 'لكن_إِذَا': 'إِذَا', + 'لكن_بِ': 'بِ:gen', + 'لٰكِن_بَعدَ': 'بَعدَ:gen', + 'لكن_دَاخِلَ': 'دَاخِلَ:gen', + 'لكن_لَدَى': 'لَدَى:gen', + 'لٰكِن_مَعَ': 'مَعَ:gen', + 'لِكَي': 'لِكَي', # li-kay = in order to + 
'لَمَّا': 'كُلَّمَا', + 'لَمَّا_لِ': 'كُلَّمَا', + 'لَو': 'لَو', # law = if + 'لَو_أَنَّ': 'لَو', # if + 'لَو_مِن': 'لَو', # if + 'ما': 'مِمَّا', + 'مَا': 'مِمَّا', + 'ما_دَام': 'مِمَّا', + 'مادامت': 'مِمَّا', + 'مَالَم': 'مَالَم', # mālam = unless + 'مِثلَ': 'مِثلَ', # remove morphological case; miṯla = like + 'مِثلَمَا': 'مِثلَ', # miṯla = like + 'مَعَ': 'مَعَ:gen', # maʿa = with + 'مَعَ_أَنَّ': 'مَعَ:gen', + 'مَعَ_بِ': 'مَعَ:gen', + 'مَعَ_فِي': 'مَعَ:gen', + 'مَعَ_مِن_بَينَ': 'بَينَ:gen', + 'مقابل': 'مُقَابِلَ:gen', + 'مُقَابِلَ': 'مُقَابِلَ:gen', # muqābila = in exchange for, opposite to, corresponding to + 'مُقَابِلَ_حَوَالَى': 'مُقَابِلَ:gen', # muqabila hawala + 'مُقَارَن_بِ': 'بِ:gen', + 'مِمَّا': 'مِمَّا', # mimmā = that, which + 'مِمَّا_لَدَى': 'مِمَّا', # mimmā = that, which + 'مِن': 'مِن:gen', # min = from + 'مِن_اجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل_أَن': 'مِن:gen', + 'مِن_إِلَى': 'مِن:gen', + 'مِن_أَن': 'مِن:gen', + 'مِن_أَنَّ': 'مِن:gen', + 'مِن_بِ': 'مِن:gen', + 'مِن_بَعدَ': 'مِن:gen', + 'مِن_بَينَ': 'بَينَ:gen', + 'مِن_تَحتَ': 'مِن:gen', + 'مِن_ثَمَّ': 'مِن:gen', + 'مِن_ثُمَّ': 'مِن:gen', + 'مِن_جَانِب': 'إِلَى_جَانِبِ:gen', # min ǧānibi = beside + 'مِن_جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'مِن_حَوَالَى': 'مِن:gen', # min hawala = from around X + 'مِن_حَولَ': 'مِن:gen', + 'مِن_حَيثُ': 'مِن:gen', + 'مِن_خَارِج': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خَارِجَ': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خِلَالَ': 'مِن_خِلَالِ:gen', # min ḫilāli = through, during + 'مِن_دَاخِلَ': 'مِن_دَاخِلِ:gen', # min dāḫili = from inside + 'مِن_دُون': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُونَ': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُون_أَن': 'مِن_دُونِ:gen', + 'مِن_دُونَ_أَن': 'مِن_دُونِ:gen', # min dūni ʾan = without, beneath, underneath + clause + 'مِن_زَاوِيَة': 'مِن:gen', + 'مِن_شَأن': 'مِن_شَأنِ:gen', # min šaʾni = from matter + 'مِن_ضِمنَ': 'مِن_ضِمنِ:gen', # min ḍimni = from within = including + 'مِن_طَرَف': 'مِن:gen', + 'مِن_عَلَى': 'مِن:gen', + 'مِن_عِندَ': 'مِن:gen', + 'مِن_غَير_أَن': 'مِن:gen', + 'مِن_فَوقَ': 'مِن_فَوقِ:gen', # min fawqi = from above + 'مِن_فِي': 'مِن:gen', + 'مِن_قَبلَ': 'مِن_قِبَلِ:gen', + 'مِن_قِبَل': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_قِبَل_بِ_فِي': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_مِثلَ': 'مِثلَ', # miṯla = like + 'مِن_مِن': 'مِن:gen', + 'مِن_مِن_بَينَ': 'بَينَ:gen', + 'مِن_مَوقِع': 'مِن:gen', + 'مِن_نَاحِيَة': 'مِن:gen', + 'مِن_وَرَاءَ': 'مِن_وَرَاءِ:gen', # min warāʾi = from behind + 'مُنذُ': 'مُنذُ:gen', # munḏu = since + 'مُنذُ_أَن': 'مُنذُ:gen', + 'مُنذُ_نَحوَ': 'مُنذُ:gen', # since about N + 'مُنذُ_وَ_فِي': 'مُنذُ:gen', + 'مَهمَا': 'مَهمَا', # mahmā = regardless + 'نَاهِيك_بِ': 'بِ:gen', + 'نَتِيجَة_لِ': 'لِ:gen', + 'نَحوَ': 'نَحوَ', # naḥwa = about, approximately + 'نَحوَ_بِ': 'بِ:gen', # about by N + 'هذا_بالأضافة': 'بِ:gen', + 'وان': 'أَنَّ', + 'وإن': 'إِنَّ', + 'وبشان': 'بِشَأنِ:gen', + 'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'وِفقَ': 'وِفقَ:gen', # wifqa = according to + 'ولو': 'إِذَا', # walaw = even if + 'ولو_أَنَّ': 'إِذَا' # walaw = even if + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. 
+ """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + solved = False + # Arabic clauses often start with وَ wa "and", which does not add + # much to the meaning but sometimes gets included in the enhanced + # case label. Remove it if there are more informative subsequent + # morphs. + edep['deprel'] = re.sub(r':وَ_', r':', edep['deprel']) + edep['deprel'] = re.sub(r':وَ:', r':', edep['deprel']) + edep['deprel'] = re.sub(r':وَ$', r'', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + re_prefix = r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):' + re_suffix = r'([_:].+)?$' + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(re_prefix + x + re_suffix, edep['deprel']) + if m and (not m.group(2) or not (x + m.group(2)) in exceptions): + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/ca/__init__.py b/udapi/block/ud/ca/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/ca/addmwt.py b/udapi/block/ud/ca/addmwt.py new file mode 100644 index 00000000..49b79da1 --- /dev/null +++ b/udapi/block/ud/ca/addmwt.py @@ -0,0 +1,194 @@ +"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions. + +According to the UD guidelines, contractions such as "del" = "de el" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs. 
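+
+A usage sketch for such a conversion (the file names are illustrative):
+
+    udapy read.Conllu files=legacy_ca.conllu ud.ca.AddMwt write.Conllu > fixed_ca.conllu
+
+This should split contractions such as "del" into the syntactic words "de" + "el"
+and group them under one multi-word token, following the MWTS table below.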
+""" +import re +import udapi.block.ud.addmwt + +MWTS = { + 'al': {'form': 'a el', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'als': {'form': 'a els', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'del': {'form': 'de el', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'dels': {'form': 'de els', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'pel': {'form': 'per el', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'pels': {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'] + v['upos'] = 'ADP DET' + v['deprel'] = '* det' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def __init__(self, verbpron=False, **kwargs): + super().__init__(**kwargs) + self.verbpron = verbpron + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + + if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. + analysis = analysis.copy() + analysis['shape'] = 'subtree' + return analysis + return None + + def fix_personal_pronoun(self, node): + # There is a mess in lemmas and features of personal pronouns. + if node.upos == 'PRON': + if re.match("^jo$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Nom|Number=Sing|Person=1|PronType=Prs' + if re.match("^(em|m'|-me|'m|me|m)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs' + if re.match("^mi$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs' + if re.match("^tu$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs' + if re.match("^(et|t'|-te|'t|te|t)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs' + if re.match("^ti$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). 
+ if re.match("^ell$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^ella$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(el|-lo|'l|lo)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(la|-la)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(l')$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(ho|-ho)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs' + if re.match("^(li|-li)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Dat|Number=Sing|Person=3|PronType=Prs' + if re.match("^(es|s'|-se|'s|se|s)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes' + if re.match("^si$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes' + # If nosaltres can be used after a preposition, we should not tag it as nominative. + if re.match("^nosaltres$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Plur|Person=1|PronType=Prs' + # Nós is the majestic first person singular. In accusative and dative, it is identical to first person plural. + if re.match("^nós$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Sing|Person=1|Polite=Form|PronType=Prs' + if re.match("^(ens|-nos|'ns|nos|ns)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs' + if re.match("^vosaltres$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|PronType=Prs' + # Vós is the formal second person singular. In accusative and dative, it is identical to second person plural. + # Vostè is even more formal than vós. In accusative and dative, it is identical to third person singular. + if re.match("^(vós|vostè)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Sing|Person=2|Polite=Form|PronType=Prs' + if re.match("^vostès$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|Polite=Form|PronType=Prs' + if re.match("^(us|-vos|-us|vos)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). + if re.match("^ells$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Plur|Person=3|PronType=Prs' + if re.match("^elles$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # Els is masculine accusative, or dative in any gender. 
+ if re.match("^(els|-los|'ls|los|ls)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs' + if re.match("^(les|-les)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # There are also "adverbial" pronominal clitics that can occur at direct object positions. + if re.match("^(en|n'|'n|-ne|n|ne)$", node.form, re.IGNORECASE): + node.lemma = 'en' + node.feats = 'Case=Gen|Person=3|PronType=Prs' + if re.match("^(hi|-hi)$", node.form, re.IGNORECASE): + node.lemma = 'hi' + node.feats = 'Case=Loc|Person=3|PronType=Prs' + + def report_suspicious_lemmas(self, node): + # There are offset issues of splitted multi_word_expressions. + # Sometimes a word gets the lemma of the neighboring word. + if node.form.lower()[:1] != node.lemma.lower()[:1]: + # Exclude legitimate cases where the lemma starts with a different letter. + hit = True + if node.lemma == 'jo' and re.match("(em|ens|m'|me|mi|nos|nosaltres|'ns)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'tu' and re.match("(et|'t|us|vosaltres|vostè)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'el' and re.match("(la|l|l'|les)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ell' and re.match("(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'es' and re.match("(s|se)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'em' and re.match("('m|m|m')", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'en' and re.match("('n|n'|ne|-ne)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'anar' and re.match("(va|van|vàrem)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ser' and re.match("(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'estar' and re.match("(sigut)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'caure' and re.match("(queia|queies|quèiem|quèieu|queien)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ampli' and re.match("(àmplia|àmplies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'indi' and re.match("(índies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'obvi' and re.match("(òbvia)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ossi' and re.match("(òssies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ús' and re.match("(usos)", node.form, re.IGNORECASE): + hit = False + # Form = '2001/37/CE', lemma = 'CE' + # Form = 'nº5', lemma = '5' + # Form = 'kg.', lemma = 'quilogram' + # Form = 'un', lemma = '1' + if node.lemma == 'CE' or re.match("nº", node.form, re.IGNORECASE) or re.match("^quil[oò]", node.lemma, re.IGNORECASE) or re.match("^[0-9]+$", node.lemma): + hit = False + if hit: + print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address())) diff --git a/udapi/block/ud/ca/elque.py b/udapi/block/ud/ca/elque.py new file mode 100644 index 00000000..6b3ad22b --- /dev/null +++ b/udapi/block/ud/ca/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que...'). +It is written for Catalan but a similar block should work for Spanish and other +Romance languages. 
+""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if node.lemma == 'que' and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. + if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. + stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. + attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. 
+ """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,7 +34,9 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, + previous_form_label='CorrectForm', previous_text_label='OrigText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,33 +56,66 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. + previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". 
""" super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form - if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return @@ -112,13 +147,14 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -130,6 +166,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -181,18 +221,37 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc[self.added_label] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -201,20 +260,33 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): - if self.allow_space(form): - self.store_orig_form(node, form) - node.form = form - elif self.allow_goeswith: - forms = form.split() - node.form = forms[0] - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -235,8 +307,14 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_orig_form(node, form) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc[self.added_label] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): @@ -261,6 +339,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 4c203ddc..c1b3783a 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -1,17 +1,26 @@ """Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" import udapi.block.ud.addmwt +import re +import logging +# Define static rules for 'aby', 'kdyby' and similar forms. MWTS = { - 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'abys': {'form': 'aby bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, - 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + # Old Czech 'abychme' == Modern Czech 'abychom' + 'abychme': {'form': 'aby bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. 
+ 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, + 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, } for v in MWTS.values(): v['upos'] = 'SCONJ AUX' @@ -25,49 +34,153 @@ person = '1' elif 'Person=2' in v['feats']: person = '2' - v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) v['deprel'] = '* aux' v['lemma'] = v['form'].split()[0] + ' být' v['main'] = 0 v['shape'] = 'siblings' +# Define static rules for 'nač', 'oč', 'zač' (but not 'proč'). +# Add them to the already existing dictionary MWTS. # nač -> na + co -for prep in 'na za o'.split(): +for prep in 'na o za'.split(): MWTS[prep + 'č'] = { 'form': prep + ' co', 'lemma': prep + ' co', 'upos': 'ADP PRON', + 'xpos': 'RR--4---------- PQ--4----------', + 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + # Avoid adding a MWT if the current node already is part of an MWT. + if node.multiword_token: + return None analysis = MWTS.get(node.form.lower(), None) if analysis is not None: return analysis - - # There is no VerbType=verbconj in the UD_Czech data. - # The purpose of this rule is rather to show that - # it is possible to write such "dynamic" rules - # (which cannot be included in static MWTS). - if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': - return { - 'form': node.form.lower()[:-1] + ' neboť', - 'lemma': '* neboť', - 'upos': '* CCONJ', - 'xpos': 'Vt-S---3P-NA--2 J^-------------', - 'feats': '* _', - 'deprel': '* cc', - 'main': 0, - 'shape': 'subtree', - } + # If the node did not match any of the static rules defined in MWTS, + # check it against the "dynamic" rules below. The enclitic 'ť' will be + # separated from its host but only if it has been marked by an annotator + # in MISC. (These are annotation conventions used for Old Czech in the + # Hičkok project.) + if node.misc['AddMwt'] != '': + subtokens = node.misc['AddMwt'].split() + if len(subtokens) != 2: + logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt']) + return None + token_from_subtokens = ''.join(subtokens) + if subtokens[1] == 'jsi': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jsi', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---2P-AAI--', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'i': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' i', + 'lemma': '* i', + 'upos': '* CCONJ', + 'xpos': '* J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + if subtokens[1] in ['ť', 'tě', 'ti']: + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ť', + 'upos': '* PART', + 'xpos': '* TT-------------', + 'feats': '* _', + 'deprel': '* discourse', + 'main': 0, + 'shape': 'subtree', + } + # Contractions of prepositions and pronouns almost could be processed + # regardless of AddMwt instructions by the annotator, but we still + # require it to be on the safe side. For example, both 'přědeň' and + # 'přěden' are attested in Old Czech but then we do not want to catch + # 'on' (besides the wanted 'oň'). Another reason si that the pronoun + # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim + # by default, unless the original token was annotated as Animacy=Inan + # or Gender=Neut. + m = re.match(r"^(na|nade|o|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + if m: + node.misc['AddMwt'] = '' + # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' + # (skrze něj). + if m.group(1) == 'přěde': + pform = 'přěd' + plemma = 'před' + adptype = 'Voc' + at = 'V' + elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): + pform = m.group(1) + plemma = 'skrz' + adptype = 'Voc' + at = 'V' + else: + pform = m.group(1) + plemma = m.group(1) + adptype = 'Prep' + at = 'R' + # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. + if node.feats['Gender'] == 'Neut': + gender = 'Neut' + animacy = '' + g = 'N' + elif node.feats['Animacy'] == 'Inan': + gender = 'Masc' + animacy = 'Animacy=Inan|' + g = 'I' + else: + gender = 'Masc' + animacy = 'Animacy=Anim|' + g = 'M' + if m.group(2).lower() == 'ž': + return { + 'form': pform + ' nějž', + 'lemma': plemma + ' jenž', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + else: + return { + 'form': pform + ' něj', + 'lemma': plemma + ' on', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } return None def postprocess_mwt(self, mwt): diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py new file mode 100644 index 00000000..bd85e1b4 --- /dev/null +++ b/udapi/block/ud/cs/fixedeprels.py @@ -0,0 +1,615 @@ +"""Block to fix case-enhanced dependency relations in Czech.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'aby': [], + 'ač': [], + 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'ačkoliv': [], # ... 
ale možná ne když je doprovázeno předložkou + 'ať': [], + 'byť': [], + 'i_když': [], + 'jak': [], + 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole + 'jako': [], + 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' + 'když': [], + 'než': ['než_aby'], + 'nežli': [], + 'pokud': [], + 'protože': [], + 'takže': [], + 'třebaže': [], + 'že': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto") + 'abi': 'aby', + 'aby_na': 'na:loc', + 'ačkoliv': 'ačkoli', + 'ať': 'ať', # remove morphological case + 'ať_forma': 'formou:gen', + 'ať_na': 'na:loc', + 'ať_s': 's:ins', + 'ať_v': 'v:loc', + 'ať_v_oblast': 'v_oblasti:gen', + 'ať_z': 'z:gen', + 'ať_z_hledisko': 'z_hlediska:gen', + 'ať_z_strana': 'ze_strany:gen', + 'až_do': 'do:gen', + 'až_o': 'o:acc', + 'během': 'během:gen', + 'bez': 'bez:gen', + 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_na': 'bez_ohledu_na:acc', ###!!! a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703) + 'bez_zřetel_k': 'bez_zřetele_k:dat', + 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blízko': 'blízko:dat', + 'blízko_k': 'blízko:dat', + 'blíž': 'blízko:dat', + 'blíže': 'blízko:dat', + 'bok_po_bok_s': 'bok_po_boku_s:ins', + 'cesta': 'cestou:gen', + 'coby': 'coby', # remove morphological case + 'daleko': 'nedaleko:gen', + 'daleko_od': 'od:gen', + 'dík': 'díky:dat', + 'díky': 'díky:dat', + 'dle': 'dle:gen', + 'do': 'do:gen', + 'do_čelo': 'do_čela:gen', + 'do_k': 'k:dat', + 'do_oblast': 'do_oblasti:gen', + 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_ruka': 'do_rukou:gen', + 'do_soulad_s': 'do_souladu_s:ins', + 'důsledkem': 'v_důsledku:gen', + 'forma': 'formou:gen', + 'formou': 'formou:gen', + 'hledět_na': 'nehledě_na:acc', + 'i_když': 'i_když', # remove morphological case + 'i_pro': 'pro:acc', + 'jak_aby': 'jak', + 'jak_ad': 'jak', + 'jakkoliv': 'jakkoli', + 'jako': 'jako', # remove morphological case + 'jako_kupříkladu': 'jako', + 'jakoby': 'jako', + 'jakoby_pod': 'pod:ins', + 'jakožto': 'jako', + 'jelikož_do': 'jelikož', + 'jenom': 'jen', + 'jesli': 'jestli', + 'jestli_že': 'jestliže', + 'jménem': 'jménem:gen', + 'k': 'k:dat', + 'k_konec': 'ke_konci:gen', + 'k_prospěch': 'ku_prospěchu:gen', + 'kdykoliv': 'kdykoli', + 'kol': 'kolem:gen', + 'kolem': 'kolem:gen', + 'kolem_dokola': 'kolem:gen', + 'koncem': 'koncem:gen', + 'konec': 'koncem:gen', + 'krom': 'kromě:gen', + 'kromě': 'kromě:gen', + 'kvůli': 'kvůli:dat', + 'leda_když': 'ledaže', + 'li_jako': 'li', + 'liž': 'li', + 'mezi_uvnitř': 'uvnitř:gen', + 'na:ins': 'na:acc', + 'na_báze': 'na_bázi:gen', + 'na_čelo': 'na_čele:gen', + 'na_mimo': 'na:loc', # na kurtě i mimo něj + 'na_než': 'na:acc', # na víc než čtyři a půl kilometru + 'na_od': 'na_rozdíl_od:gen', + 'na_počátek': 'na_počátku:gen', + 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is same + 'na_podklad': 'na_podkladě:gen', + 'na_rozdíl_od': 'na_rozdíl_od:gen', + 'na_strana': 'na_straně:gen', + 'na_účet': 'na_účet:gen', + 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na 
újmu' is probably not used as an independent oblique modifier + 'na_úroveň': 'na_úrovni:gen', + 'na_úroveň_okolo': 'na_úrovni:gen', + 'na_úsek': 'na_úseku:gen', + 'na_začátek': 'na_začátku:gen', + 'na_základ': 'na_základě:gen', + 'na_základna': 'na_základně:gen', + 'na_závěr': 'na_závěr:gen', + 'na_zda': 'na:loc', # na tom, zda a v jaké formě... + 'namísto': 'namísto:gen', + 'namísto_do': 'do:gen', + 'napospas': 'napospas:dat', + 'narozdíl_od': 'na_rozdíl_od:gen', + 'následek': 'následkem:gen', + 'navzdory': 'navzdory:dat', + 'nedaleko': 'nedaleko:gen', + 'než': 'než', # remove morphological case + 'nežli': 'nežli', # remove morphological case + 'o_jako': 'jako', + 'o_o': 'o:acc', + 'od': 'od:gen', + 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče + 'ohledně': 'ohledně:gen', + 'okolo': 'okolo:gen', + 'oproti': 'oproti:dat', + 'po_v': 'po:loc', + 'po_bok': 'po_boku:gen', + 'po_doba': 'po_dobu:gen', + 'po_stránka': 'po_stránce:gen', + 'po_vzor': 'po_vzoru:gen', + 'poblíž': 'poblíž:gen', + 'počátek': 'počátkem:gen', + 'počátkem': 'počátkem:gen', + 'počínaje': 'počínaje:ins', + 'počínat': 'počínaje:ins', + 'počínat_od': 'počínaje:ins', + 'pod_dojem': 'pod_dojmem:gen', + 'pod_tlak': 'pod_tlakem:gen', + 'pod_vliv': 'pod_vlivem:gen', + 'pod_záminka': 'pod_záminkou:gen', + 'pod_záminka_že': 'pod_záminkou_že', + 'podél': 'podél:gen', + 'podle': 'podle:gen', + 'pomoc': 'pomocí:gen', + 'pomocí': 'pomocí:gen', + 'postup': 'postupem:gen', + 'pouze_v': 'v:loc', + 'pro': 'pro:acc', + 'pro_aby': 'pro:acc', + 'prostřednictví': 'prostřednictvím:gen', + 'prostřednictvím': 'prostřednictvím:gen', + 'proti': 'proti:dat', + 'proto_aby': 'aby', + 'protože': 'protože', # remove morphological case + 'před_během': 'během:gen', # před a během utkání + 'před_po': 'po:loc', # před a po vyloučení Schindlera + 'přes': 'přes:acc', + 'přes_přes': 'přes:acc', # annotation error + 'přestože': 'přestože', # remove morphological case + 'při': 'při:loc', + 'při_pro': 'při:loc', + 'při_příležitost': 'při_příležitosti:gen', + 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins', + 's_cíl': 's_cílem', # s cílem projednat X + 's_ohled_k': 's_ohledem_k:dat', + 's_ohled_na': 's_ohledem_na:acc', + 's_pomoc': 's_pomocí:gen', + 's_postup': 'postupem:gen', + 's_přihlédnutí_k': 's_přihlédnutím_k:dat', + 's_přihlédnutí_na': 's_přihlédnutím_na:acc', + 's_výjimka': 's_výjimkou:gen', + 's_výjimka_z': 's_výjimkou:gen', + 's_výjimka_že': 's_výjimkou_že', + 's_vyloučení': 's_vyloučením:gen', + 's_zřetel_k': 'se_zřetelem_k:dat', + 's_zřetel_na': 'se_zřetelem_na:acc', + 'severně_od': 'od:gen', + 'skrz': 'skrz:acc', + 'směr_do': 'směrem_do:gen', + 'směr_k': 'směrem_k:dat', + 'směr_na': 'směrem_na:acc', + 'směr_od': 'směrem_od:gen', + 'směr_přes': 'směrem_přes:acc', + 'směr_z': 'směrem_z:gen', + 'společně_s': 'společně_s:ins', + 'spolu': 'spolu_s:ins', + 'spolu_s': 'spolu_s:ins', + 'spolu_se': 'spolu_s:ins', + 'stranou': 'stranou:gen', + 'stranou_od': 'stranou:gen', + 'takže': 'takže', # remove morphological case + 'takže_a': 'takže', + 'třebaže': 'třebaže', # remove morphological case + 'tvář_v_tvář': 'tváří_v_tvář:dat', + 'u': 'u:gen', + 'u_příležitost': 'u_příležitosti:gen', + 'uprostřed': 'uprostřed:gen', + 'uvnitř': 'uvnitř:gen', + 'v:ins': 'v:loc', # ve skutečností (překlep) + 'v_analogie_s': 'v_analogii_s:ins', + 'v_blízkost': 'v_blízkosti:gen', + 'v_čas': 'v_čase:gen', + 'v_čelo': 'v_čele:gen', + 'v_čelo_s': 'v_čele_s:ins', + 'v_doba': 'v_době:gen', + 'v_dohoda_s': 'v_dohodě_s:ins', + 'v_duch': 'v_duchu:gen', + 'v_důsledek': 
'v_důsledku:gen', + 'v_forma': 've_formě:gen', + 'v_jméno': 've_jménu:gen', + 'v_k': 'k:dat', + 'v_kombinace_s': 'v_kombinaci_s:ins', + 'v_konfrontace_s': 'v_konfrontaci_s:ins', + 'v_kontext_s': 'v_kontextu_s:ins', + 'v_na': 'na:loc', + 'v_neprospěch': 'v_neprospěch:gen', + 'v_oblast': 'v_oblasti:gen', + 'v_oblast_s': 's:ins', + 'v_obor': 'v_oboru:gen', + 'v_otázka': 'v_otázce:gen', + 'v_podoba': 'v_podobě:gen', + 'v_poměr_k': 'v_poměru_k:dat', + 'v_porovnání_s': 'v_porovnání_s:ins', + 'v_proces': 'v_procesu:gen', + 'v_prospěch': 've_prospěch:gen', + 'v_protiklad_k': 'v_protikladu_k:dat', + 'v_průběh': 'v_průběhu:gen', + 'v_případ': 'v_případě:gen', + 'v_případ_že': 'v_případě_že', + 'v_rámec': 'v_rámci:gen', + 'v_reakce_na': 'v_reakci_na:acc', + 'v_rozpor_s': 'v_rozporu_s:ins', + 'v_řada': 'v_řadě:gen', + 'v_shoda_s': 've_shodě_s:ins', + 'v_služba': 've_službách:gen', + 'v_směr': 've_směru:gen', + 'v_směr_k': 've_směru_k:dat', + 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc + 'v_smysl': 've_smyslu:gen', + 'v_součinnost_s': 'v_součinnosti_s:ins', + 'v_souhlas_s': 'v_souhlasu_s:ins', + 'v_soulad_s': 'v_souladu_s:ins', + 'v_souvislost_s': 'v_souvislosti_s:ins', + 'v_spojení_s': 've_spojení_s:ins', + 'v_spojení_se': 've_spojení_s:ins', + 'v_spojený_s': 've_spojení_s:ins', + 'v_spojitost_s': 've_spojitosti_s:ins', + 'v_spolupráce_s': 've_spolupráci_s:ins', + 'v_s_spolupráce': 've_spolupráci_s:ins', + 'v_srovnání_s': 've_srovnání_s:ins', + 'v_srovnání_se': 've_srovnání_s:ins', + 'v_stav': 've_stavu:gen', + 'v_stín': 've_stínu:gen', + 'v_světlo': 've_světle:gen', + 'v_úroveň': 'v_úrovni:gen', + 'v_věc': 've_věci:gen', + 'v_vztah_k': 've_vztahu_k:dat', + 'v_vztah_s': 've_vztahu_k:dat', + 'v_zájem': 'v_zájmu:gen', + 'v_záležitost': 'v_záležitosti:gen', + 'v_závěr': 'v_závěru:gen', + 'v_závislost_na': 'v_závislosti_na:loc', + 'v_závislost_s': 'v_závislosti_s:ins', + 'v_znamení': 've_znamení:gen', + 'včetně': 'včetně:gen', + 'vedle': 'vedle:gen', + 'versus': 'versus:nom', + 'vina': 'vinou:gen', + 'vliv': 'vlivem:gen', + 'vlivem': 'vlivem:gen', + 'vůči': 'vůči:dat', + 'výměna_za': 'výměnou_za:acc', + 'vzhledem': 'vzhledem_k:dat', + 'vzhledem_k': 'vzhledem_k:dat', + 'z': 'z:gen', + 'z_důvod': 'z_důvodu:gen', + 'z_hledisko': 'z_hlediska:gen', + 'z_oblast': 'z_oblasti:gen', + 'z_řada': 'z_řad:gen', + 'z_strana': 'ze_strany:gen', + 'z_nedostatek': 'z_nedostatku:gen', + 'z_titul': 'z_titulu:gen', + 'z_začátek': 'ze_začátku:gen', + 'za_pomoc': 'za_pomoci:gen', + 'za_účast': 'za_účasti:gen', + 'za_účel': 'za_účelem:gen', + 'začátek': 'začátkem:gen', + 'zásluha': 'zásluhou:gen', + 'zatím_co': 'zatímco', + 'závěr': 'závěrem:gen', + 'závisle_na': 'nezávisle_na:loc', + 'že': 'že', # remove morphological case + 'že_ať': 'ať', + 'že_jako': 'že', + 'že_jakoby': 'že', + 'že_za': 'za:gen' + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. 
For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'advcl:aby', edep['deprel']) # byl by pro, abychom... ###!!! Opravit i konverzi stromu. + edep['deprel'] = re.sub(r'^advcl:s(?::ins)?$', r'advcl', edep['deprel']) ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu. + edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:od(?::gen)?$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:od(?::gen)?$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:podle(?::gen)?$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:když.*$', r'nmod', edep['deprel']) # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + edep['deprel'] = re.sub(r'^obl:ačkoli.*$', r'obl', edep['deprel']) # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + edep['deprel'] = re.sub(r'^obl:jestli(?::gen)?$', r'obl:gen', edep['deprel']) # nevím, jestli osmého nebo devátého září + # Removing 'až' must be done early. The remainder may be 'počátek' + # and we will want to convert it to 'počátkem:gen'. + edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. 
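+ # For example, 'během' (during) always governs the genitive, so an erroneous
+ # 'nmod:během:acc' is normalized to 'nmod:během:gen' here.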
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): + edep['deprel'] = m.group(1)+':'+adpcase + continue + if re.match(r'^(acl|advcl):', edep['deprel']): + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). + edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. + edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. 
čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. + edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): + edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. + for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): + edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + # If there is an nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + # Annotation error. + if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. 
+ edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. + cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. + edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. 
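+ # (Temporal 'za' with the accusative, as in 'za rok', is much more common in
+ # these contexts than 'za' with the instrumental.)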
+ edep['deprel'] += ':acc' + else: + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. 
If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py new file mode 100644 index 00000000..17570ee2 --- /dev/null +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -0,0 +1,929 @@ +""" +Block to identify missing or ill-valued features in Czech. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + # The convention used in PDT is not consistent. Adjectives are fully disambiguated + # (three genders, two animacies, three numbers, seven cases), even though some + # forms are shared among many feature combinations. On the other hand, pronouns + # and determiners omit some features in the context of certain values of other + # features (e.g., gender and animacy are not distinguished in plural if the case + # is genitive, dative, locative or instrumental). + # In contrast, ČNK (CNC) fully disambiguates pronouns and determiners just like + # adjectives. + # Here we can trigger one of the two conventions. It should become a block parameter + # in the future. + pdt20 = False # True = like in PDT 2.0; False = like in ČNK + + def process_node(self, node): + # Czech constraints should not be applied to foreign words. + if node.feats['Foreign'] == 'Yes': + pass + # NOUNS ################################################################ + elif node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['VerbForm'] == 'Vnoun': + # verbal nouns: bytí, dělání, ... 
+ self.check_allowed_features(node, { + 'VerbForm': ['Vnoun'], + 'Gender': ['Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'] + }) + elif node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord', 'Mult'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 
'NumType': ['Ord', 'Mult'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree + if node.feats['Gender'] == 'Masc': + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: + if node.feats['Gender'] == 'Masc': + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: # regular adjectives, including short forms + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + elif re.match(r"^(ho|mu)$", node.form.lower()): + # The short (clitic) forms do not have PrepCase in Modern Czech. + # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho' + # (besides 'jeho' and 'ho'); it should not have Variant=Short + # and it should have PrepCase=Npr (the next block). + self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Variant': ['Short'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... je + # No gender in dual and plural: + # Plur ... jich, jim, je, nich, jimi + # Here we require PrepCase but disallow Variant. + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. 
Někdo and nikdo behave like singular;
+                # kdo is by default singular as well but it also occurs as subject
+                # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni").
+                # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody)
+                # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only).
+                # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with a question mark; indirect questions like "Ptal ses, kdo to je?" use Rel.)
+                # New Czech data, in particular PDT, use Int,Rel regardless of context.
+                self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'],
+                    'Gender': ['Masc'],
+                    'Animacy': ['Anim'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+                })
+            elif re.match(r'^(co|což|což?koliv?|něco|lečco|lecco|nic|nicož)$', node.lemma):
+                # Although these pronouns behave by default as neuter singular,
+                # neither Gender nor Number is annotated. However, quite unusually,
+                # there is Animacy=Inan without Gender.
+                ###!!! This should probably be fixed in all Czech treebanks and
+                ###!!! in Interset. The pronoun should get Gender=Neut and no
+                ###!!! animacy. For now, let's at least make animacy an optional
+                ###!!! feature (I see that we already do not fill it in the Old
+                ###!!! Czech data).
+                # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with a question mark; indirect questions like "Ptal ses, co to je?" use Rel.)
+                # New Czech data, in particular PDT, use Int,Rel regardless of context.
+                self.check_required_features(node, ['PronType', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'],
+                    'Animacy': ['Inan'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+                })
+            elif node.lemma == 'ješto':
+                # Unlike 'jenžto', this relative pronoun does not inflect, it
+                # always occurs in a nominative position, but the context can
+                # be any gender and number.
+                # Update from the Hičkok project: 'ješto' is lemmatized to
+                # 'jenžto' (see below), meaning that this branch should not be
+                # needed for the new data.
+                self.check_required_features(node, ['PronType', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Rel'],
+                    'Case': ['Nom']
+                })
+            elif re.match(r'^(jenž|jenžto)$', node.lemma):
+                # The relative pronouns 'jenž', 'jenžto' inflect for gender;
+                # while we normally take this as a sign of DET (instead of PRON),
+                # these can never act as real DET because they never modify a
+                # nominal.
+                # Similarly to the personal pronoun 'on', animacy is only
+                # annotated for masculine nominative plural, non-nominative
+                # forms are merged for masculine and neuter (jehož, jemuž), and
+                # non-singular gender is only annotated in nominative (while
+                # these cases are common for all genders: jichž, jimž, jimiž).
+                # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even
+                # in the nominative, although there is no prepositional counter-
+                # part (but similarly the locative has no prepositionless form).
+                # Update from the Hičkok project: In Old Czech, both 'jenž' and
+                # 'jenžto' (or its variant 'ješto') can be used uninflected,
+                # accompanied by a resumptive pronoun which provides the inflection.
+                # In this case, the Hičkok data will not annotate Gender, Animacy,
+                # Number and Case of the relative pronoun. Therefore, we require
+                # the full set of features if any of them is present; otherwise,
+                # we only expect PronType and PrepCase.
+ if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '': + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + self.check_required_features(node, ['PronType', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'PrepCase': ['Npr'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. + if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. + if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. na jeho dům). + # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated + # (jeho se bojím VS. bez něho se neobejdu). + }) + # Relative possessive determiners 'jehož' and 'jejichž' behave similarly + # to the personal possessive determiners but they do not have Person. 
+ elif re.match(r'^(jeho|jejich|j[ií]ch)ž(e|to)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used + # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž", + # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme). + }) + # Feminine personal possessive determiner. + elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiej') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # Feminine relative possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()): + # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiejž') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(můj|tvůj|svůj)$', node.lemma): + if node.feats['Reflex'] == 'Yes': + self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) + elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma): + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Int', 'Rel', 'Ind', 'Neg'], + 'Poss': ['Yes'] + }) + elif re.match(r'^(sám|samý)$', node.lemma): + # The above condition looks at both lemma options, although only one lemma is assumed. + # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám". + # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']}) + elif node.lemma == 'veškerý': + # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']}) + elif node.lemma == 'žádný': + # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. 
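+                # 'nejeden' (literally "not one", i.e. more than one) inflects like an adjective, hence check_adjective_like() below.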
+ if node.lemma == 'nejeden': + self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) + else: + # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs: + # hodně, více, nejvíce; málo, méně, nejméně + # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo). + self.check_required_features(node, ['PronType', 'NumType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Card'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']}) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. + # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli'). + # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form). 
+ if node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.lemma == 'půl': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'] + }) + elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Sets'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + # In PDT, cardinal numerals higher than four in nominative/accusative/vocative + # have Number=Sing instead of Plur! It may be motivated by the default + # agreement they trigger on verbs (but they don't have Gender=Neut). + # It does not make much sense but we must allow Sing before a better + # approach is defined and implemented in the data. + # On the other hand, we may want to allow Dual for "stě". + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Sets'], + 'NumForm': ['Word'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES ################################################ + elif node.upos in ['VERB', 'AUX']: + # There are only three lemmas recognized as AUX in Czech. This is not + # about features and it would be caught by the UD validator, but it + # is error in morphology, so let's report it here as well. + if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']: + self.bug(node, 'NonAuxLemma') + # All Czech verbs (and some adjectives and nouns) must have VerbForm. + # Almost all verbs have lexical Aspect but we cannot require it + # because there are a few biaspectual verbs (e.g. 'analyzovat') that + # do not have the feature. + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] in ['Inf', 'Sup']: + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). 
+ self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + # Conditional "by" has no person and number (it is typically + # 3rd person but it could be other persons, too, as in "ty by + # ses bál"). + if node.feats['Mood'] == 'Cnd': + if node.form.lower() == 'by': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'] + }) + elif node.form.lower() == 'byšta': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['2', '3'], + 'Number': ['Dual'] + }) + else: + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'] + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet) + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + # Old Czech data annotate converb gender by context rather than form + # (because the form was different than in Modern Czech) and for + # masculines they 
also include animacy. In Modern Czech animacy is + # currently not annotated and Masc,Neut gender is merged. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['NumType'] != '': + # Adverbial multiplicative numerals (jednou, dvakrát, třikrát) + # belong here. They have also pronominal counterparts (kolikrát, + # tolikrát, několikrát). There are also adverbial ordinal numerals + # (zaprvé, poprvé, zadruhé, podruhé). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'NumType': ['Mult', 'Ord'], + 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind'] + }) + elif self.pdt20: + if node.feats['PronType'] != '': + # Pronominal adverbs in PDT are neither compared nor negated. + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + else: + if node.feats['PronType'] == 'Tot': + # Total adverbs in Old Czech can be negated: vždy, nevždy. + # Then for consistence with other adverbs, we also require + # Degree, although it will be always Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'PronType': ['Tot'], + 'Degree': ['Pos'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['PronType'] != '': + # Other pronominal adverbs are neither compared nor negated. + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) 
+ self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg'] + }) + else: + # All other adverbs should have both Degree and Polarity, + # although for some of them the values will always be Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Abbr': ['Yes'] + }) + # SUBORDINATING CONJUNCTIONS ########################################### + elif node.upos == 'SCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'] + }) + # COORDINATING CONJUNCTIONS ############################################ + elif node.upos == 'CCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'] + }) + # PARTICLES ############################################################ + elif node.upos == 'PART': + # "t." = "totiž" + self.check_allowed_features(node, { + 'Abbr': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + # (OR UNDEFINED UPOS) ################################################## + else: + if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']: + bugmsg = 'UnknownUpos' + if node.upos: + bugmsg += node.upos + self.bug(node, bugmsg) + self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_features = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. + if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. 
+ if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py new file mode 100644 index 00000000..e9367d46 --- /dev/null +++ b/udapi/block/ud/da/fixmultisubject.py @@ -0,0 +1,123 @@ +""" +Block ud.da.FixMultiSubject tries to fix some systemic instances of predicates +that have more than one subject dependent. +""" +from udapi.core.block import Block +import re + +class FixMultiSubject(Block): + """ + Make sure that a predicate has at most one subject. Note that it can + only fix instances that follow certain pattern observed in the Danish + data. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r'^[nc]subj$', x.udeprel)] + if len(subjects) > 1: + # Pattern 1: A node is is attached as xcomp to the current node, and + # one of the subjects is closer to that xcomp than to the current + # node. + xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and + # possibly not so many other mis-attached dependents. + advclchildren = [x for x in node.children if x.udeprel == 'advcl'] + # Pattern 3: Instead of xcomp or advcl, there is a simple amod + # (under a verb!), in fact an adjective with a copula that should + # have been advcl. Alternatively, the nonverbal clause is headed + # by a noun, and the deprel is obl instead of amod. + amodchildren = [x for x in node.children if re.match(r'^(amod|obl)$', x.udeprel)] + if len(subjects) == 2 and len(xcompchildren) > 0: + for xcompnode in xcompchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] <= dn[0] and dn[1] <= dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. 
+                    xcompnode.misc['ToDo'] = 'check-xcomp'
+                    break
+                # Is the second subject closer to xcomp than it is to the current node?
+                # At the same time, is the first subject closer to the current node than it is to xcomp?
+                elif dx[1] <= dn[1] and dn[0] <= dx[0]:
+                    # The second subject should be re-attached to the xcomp node.
+                    subjects[1].parent = xcompnode
+                    # There are typically other dependents that should belong to the xcomp node.
+                    for c in node.children:
+                        if c != xcompnode and dist(xcompnode, c) < dist(node, c):
+                            c.parent = xcompnode
+                    # The xcompnode should probably be attached as something else
+                    # than xcomp, perhaps even the direction of the relation should
+                    # be reversed, but one would have to resolve this manually.
+                    xcompnode.misc['ToDo'] = 'check-xcomp'
+                    break
+        elif len(subjects) == 2 and len(advclchildren) > 0:
+            for advclnode in advclchildren:
+                dn = [dist(node, x) for x in subjects]
+                dx = [dist(advclnode, x) for x in subjects]
+                # Is the first subject closer to advcl than it is to the current node?
+                # At the same time, is the second subject closer to the current node than it is to advcl?
+                if dx[0] < dn[0] and dn[1] < dx[1]:
+                    # The first subject should be re-attached to the advcl node.
+                    subjects[0].parent = advclnode
+                    break
+                # Is the second subject closer to advcl than it is to the current node?
+                # At the same time, is the first subject closer to the current node than it is to advcl?
+                elif dx[1] < dn[1] and dn[0] < dx[0]:
+                    # The second subject should be re-attached to the advcl node.
+                    subjects[1].parent = advclnode
+                    break
+        elif len(subjects) == 2 and len(amodchildren) > 0:
+            for amodnode in amodchildren:
+                if len([x for x in amodnode.children if x.udeprel == 'cop']) > 0:
+                    dn = [dist(node, x) for x in subjects]
+                    dx = [dist(amodnode, x) for x in subjects]
+                    # Is the first subject closer to amod than it is to the current node?
+                    # At the same time, is the second subject closer to the current node than it is to amod?
+                    if dx[0] < dn[0] and dn[1] < dx[1]:
+                        # The first subject should be re-attached to the amod node.
+                        subjects[0].parent = amodnode
+                        amodnode.deprel = 'advcl'
+                        # There are typically other dependents that should belong to the amod node.
+                        for c in node.children:
+                            if c != amodnode and dist(amodnode, c) < dist(node, c):
+                                c.parent = amodnode
+                        break
+                    # Is the second subject closer to amod than it is to the current node?
+                    # At the same time, is the first subject closer to the current node than it is to amod?
+                    elif dx[1] < dn[1] and dn[0] < dx[0]:
+                        # The second subject should be re-attached to the amod node.
+                        subjects[1].parent = amodnode
+                        amodnode.deprel = 'advcl'
+                        # There are typically other dependents that should belong to the amod node.
+                        for c in node.children:
+                            if c != amodnode and dist(amodnode, c) < dist(node, c):
+                                c.parent = amodnode
+                        break
+
+def dist(x, y):
+    if x.ord < y.ord:
+        a = x
+        b = y
+    else:
+        a = y
+        b = x
+    d = b.ord - a.ord
+    # Count the commas between the two nodes. A comma should be seen as increasing
+    # the distance of the nodes, that is, decreasing the probability that they
+    # are in the same clause.
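+    # Each intervening comma adds a penalty of 10 to the plain ordinal distance computed below.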
+ nc = 0 + for i in a.root.descendants: + if i.ord > a.ord and i.ord < b.ord: + if i.form == ',': + nc += 1 + d += nc * 10 + return d diff --git a/udapi/block/ud/de/addmwt.py b/udapi/block/ud/de/addmwt.py index 23ac54f9..18778a4a 100644 --- a/udapi/block/ud/de/addmwt.py +++ b/udapi/block/ud/de/addmwt.py @@ -16,15 +16,16 @@ 'durchs': {'form': 'durch das', }, 'fürs': {'form': 'fürs das', }, 'hinterm': {'form': 'hinter dem', }, + 'hinters': {'form': 'hinter das', }, 'im': {'form': 'in dem', }, 'ins': {'form': 'in das', }, 'übers': {'form': 'über das', }, 'ums': {'form': 'um das', }, - 'unters': {'form': 'unter das', }, 'unterm': {'form': 'unter dem', }, + 'unters': {'form': 'unter das', }, 'vom': {'form': 'von dem', }, - 'vors': {'form': 'vor das', }, 'vorm': {'form': 'vor dem', }, + 'vors': {'form': 'vor das', }, 'zum': {'form': 'zu dem', }, 'zur': {'form': 'zu der', }, } diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py new file mode 100644 index 00000000..65d12681 --- /dev/null +++ b/udapi/block/ud/de/fixgsd.py @@ -0,0 +1,58 @@ +""" +Block to fix annotation of UD German-GSD. +""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def process_node(self, node): + """ + Normalizes tokenization, lemmatization and tagging of ordinal numerals + that are expressed using digits followed by a period. + https://github.com/UniversalDependencies/UD_German-GSD/issues/24 + """ + # Ignore periods that terminate a sentence, although they could belong + # to an ordinal numeral at the same time. + if node.form == '.' and node.next_node: + # Ignore number+period combinations that have an intervening space. + if node.prev_node and re.match(r'^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + # Merge the number and the period into one token. + number = node.prev_node + period = node + # The period should not have any children but if it does, re-attach them to the number. + for c in period.children: + c.parent = number + # The period should be followed by a space but if it isn't, mark it at the number. + number.misc['SpaceAfter'] = 'No' if period.no_space_after else '' + number.form += '.' + number.lemma = number.form + number.upos = 'ADJ' + number.xpos = 'ADJA' + number.feats = '_' + number.feats['NumType'] = 'Ord' + if number.udeprel == 'nummod': + number.deprel = 'amod' + period.remove() + # Even if the digits and the period are already in one token, check their annotation. + if re.match(r'^\d+\.$', node.form): + node.lemma = node.form + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats = '_' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' + # Finally, make sure that ordinal numerals expressed verbosely are tagged properly. + # Unlike for digits, do not remove the features for Gender, Number, and Case. + # Skip 'acht' because we cannot reliably distinguish it from the cardinal numeral and from the verb 'achten'. + if re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE): + # Skip 'erst' that is used as an adverb. 
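+            # (e.g. "erst morgen" 'only/not until tomorrow', where "erst" is ADV and must not be relabelled as an ordinal)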
+ if node.lemma != 'erst' or node.upos != 'ADV': + node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma) + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' diff --git a/udapi/block/ud/de/fixhdt.py b/udapi/block/ud/de/fixhdt.py new file mode 100644 index 00000000..a3792a96 --- /dev/null +++ b/udapi/block/ud/de/fixhdt.py @@ -0,0 +1,109 @@ +""" +Block to fix annotation of UD German-HDT. + +It was created independently of ud.de.AddMwt but it aims to do essentially the +same thing. Future work: make the two blocks converge. + +Currently known differences: +- This block covers a wider range of contractions. +- This block generates morphological features for the syntactic words. +- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT). +- This block overrides the default attachment when the original relation is root, conj, reparandum. +- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures. +""" +from udapi.core.block import Block +import logging +import re + +class FixHDT(Block): + + def process_node(self, node): + # PronType=Art with ADP is wrong. Fused prepositions and articles should be decomposed in UD. + # The following contractions have been observed: + # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums unterm unters vom vorm vors z. zum zur + if node.upos == 'ADP' and node.feats['PronType'] == 'Art': + if re.match("^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums|unter[ms]|vom|vor[ms]|z\.|zu[mr])$", node.form, re.IGNORECASE): + # We need two nodes instead of one. Create a node. + # The parent should not be the root but unfortunately it is not guaranteed. + node2 = node.create_child() + node2.shift_after_node(node) + if not re.match(r"^(root|conj|reparandum)$", node.udeprel): + node2.parent = node.parent + node.deprel = 'case' + node2.deprel = 'det' + mwt = node.root.create_multiword_token(form=node.form, words=[node, node2], misc=node.misc) + node.misc['SpaceAfter'] = '' + # We want to respect the original letter case in the forms of the syntactic words. + # We can use the isupper() method to find out whether all letters are uppercase. + # However, detecting first-letter capitalization requires more work. 
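+                # Encoding consumed by mimic_case() below: 2 = all uppercase, 1 = only the first letter uppercase, 0 = all lowercase.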
+ up = 2 if mwt.form.isupper() else 1 if mwt.form[:1].isupper() else 0 + up2 = 2 if up == 2 else 0 + if re.match(r"^(a\.|am|ans)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'an') + node.lemma = 'an' + elif re.match(r"^aufs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'auf') + node.lemma = 'auf' + elif re.match(r"^beim$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'bei') + node.lemma = 'bei' + elif re.match(r"^durchs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'durch') + node.lemma = 'durch' + elif re.match(r"^fürs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'für') + node.lemma = 'für' + elif re.match(r"^hinter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'hinter') + node.lemma = 'hinter' + elif re.match(r"^(im|ins)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'in') + node.lemma = 'in' + elif re.match(r"^übers$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'über') + node.lemma = 'über' + elif re.match(r"^ums$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'um') + node.lemma = 'um' + elif re.match(r"^unter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'unter') + node.lemma = 'unter' + elif re.match(r"^vom$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'von') + node.lemma = 'von' + elif re.match(r"^vor[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'vor') + node.lemma = 'vor' + elif re.match(r"^(z\.|zu[mr])$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'zu') + node.lemma = 'zu' + node.upos = 'ADP' + node.xpos = 'APPR' + node.feats = '_' + node.feats['AdpType'] = 'Prep' + # We must use search() because match() only checks at the beginning of the string. + if re.search("[m\.]$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'dem') + node2.feats = 'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + elif re.search("s$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'das') + node2.feats = 'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Acc' + node2.lemma = 'der' + elif re.search("r$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'der') + node2.feats = 'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + node2.upos = 'DET' + node2.xpos = 'ART' + +def mimic_case(up, x): + if up >= 2: + return x.upper() + elif up == 1: + return x[:1].upper() + x[1:].lower() + else: + return x.lower() diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 8381c69f..ac753ed5 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -8,13 +8,13 @@ import udapi.block.ud.addmwt MWTS = { - 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, - 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, - 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur'}, - 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur'}, - 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur'}, - 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing'}, - 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Gender=*|Number=Sing'}, + 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στα': {'form': 'σ 
τα', 'feats': '_ Case=Acc|Definite=Def|Gender=Neut|Number=Plur|PronType=Art'}, + 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Plur|PronType=Art'}, + 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Definite=Def|Gender=*|Number=Sing|PronType=Art'}, } # shared values for all entries in MWTS diff --git a/udapi/block/ud/es/addmwt.py b/udapi/block/ud/es/addmwt.py index ee85b1d6..92f80160 100644 --- a/udapi/block/ud/es/addmwt.py +++ b/udapi/block/ud/es/addmwt.py @@ -1,6 +1,6 @@ """Block ud.es.AddMwt for heuristic detection of Spanish contractions. -According to the UD guidelines, contractions such as "dele" = "de ele" +According to the UD guidelines, contractions such as "del" = "de el" should be annotated using multi-word tokens. Note that this block should be used only for converting legacy conllu files. @@ -28,7 +28,7 @@ v['lemma'] = v['form'] v['upos'] = 'ADP DET' v['deprel'] = '* det' - v['feats'] = '_ *' + v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art' # The following are the default values # v['main'] = 0 # which of the two words will inherit the original children (if any) # v['shape'] = 'siblings', # the newly created nodes will be siblings @@ -46,6 +46,11 @@ def multiword_analysis(self, node): analysis = MWTS.get(node.form.lower(), None) if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. + analysis = analysis.copy() + analysis['shape'] = 'subtree' return analysis if not self.verbpron or node.upos not in {'VERB', 'AUX'}: diff --git a/udapi/block/ud/es/elque.py b/udapi/block/ud/es/elque.py new file mode 100644 index 00000000..4d14b98d --- /dev/null +++ b/udapi/block/ud/es/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que, el cual...'). +It is written for Spanish but a similar block should work for other Romance +languages. +""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if re.match(r'^(que|cual)$', node.lemma) and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. 
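+            # (The forms 'el', 'la', 'los', 'las' are assumed to share the lemma 'el' here.)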
+ if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. + stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. + attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. + """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/es/fixexclamation.py b/udapi/block/ud/es/fixexclamation.py new file mode 100644 index 00000000..7dea8e0d --- /dev/null +++ b/udapi/block/ud/es/fixexclamation.py @@ -0,0 +1,47 @@ +"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixExclamation(Block): + + def process_node(self, node): + """ + In Spanish AnCora, there are things like '¡Hola!' as one token. + The punctuation should be separated. One may question whether this + should include names of companies (Yahoo!) or products (la revista + Hello!) but it should, as company and product names often have + multiple tokens (even multiple full words, not just punctuation) + and these are also separated in UD. 
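+ For example, a single token '¡Hola!' is split into three tokens: '¡', 'Hola' and '!', and both punctuation tokens are attached as 'punct'.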
+ """ + if re.search(r'^[¡!]\w', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_before_node(node) + punct.form = node.form[:1] + node.form = node.form[1:] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' + if re.search(r'\w[¡!]$', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_after_node(node) + punct.form = node.form[-1:] + node.form = node.form[:-1] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = node.misc['SpaceAfter'] + node.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py new file mode 100644 index 00000000..62fa0f4d --- /dev/null +++ b/udapi/block/ud/es/fixtenerque.py @@ -0,0 +1,47 @@ +"""Block to fix spurious auxiliary verbs in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixTenerQue(Block): + + def process_node(self, node): + """ + Some Spanish treebanks treat the verb 'tener' in constructions such as + 'tener que comer' as auxiliary. This is wrong and the validator will + flag it as an error. This block fixes such annotations. + + EDIT: 'ir a comer' is processed the same way. + """ + if re.match(r'^(tener|ir)$', node.lemma) and node.upos == 'AUX': + node.upos = 'VERB' + # In rare cases the auxiliary may have been promoted due to ellipsis. + # Most of the time however, it is attached as 'aux' to the main verb. + if node.udeprel == 'aux': + mainverb = node.parent + self.reattach(node, mainverb.parent, mainverb.deprel) + self.reattach(mainverb, node, 'xcomp') + # Some children of the former main verb should be reattached to 'tener'. + # Others (especially a direct object) should stay with the former main verb. + for c in mainverb.children: + if not re.match(r'^(obj|iobj|obl|ccomp|xcomp|conj|list|compound|flat|fixed|goeswith|reparandum)$', c.udeprel): + self.reattach(c, node, c.deprel) + # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. + for c in node.children: + if re.match(r'^(que|a)$', c.form.lower()) and c.ord > node.ord and c.ord < mainverb.ord: + self.reattach(c, mainverb, 'mark') + + def reattach(self, node, parent, deprel): + """ + Changes the incoming dependency relation to a node. Makes sure that the + same change is done in the basic tree and in the enhanced graph. + """ + if node.deps: + # If the enhanced graph contains the current basic relation, remove it. + orig_n_deps = len(node.deps) + node.deps = [x for x in node.deps if x['parent'] != node.parent or re.sub(r':.*', '', x['deprel']) != node.udeprel] + # Add the new basic relation to the enhanced graph only if the original one was there. 
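+ # (A decreased count means the old basic relation was mirrored in the enhanced graph and has just been removed above.)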
+ if len(node.deps) < orig_n_deps: + node.deps.append({'parent': parent, 'deprel': deprel}) + node.parent = parent + node.deprel = deprel diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py new file mode 100644 index 00000000..643ecd7c --- /dev/null +++ b/udapi/block/ud/es/fixverbfeats.py @@ -0,0 +1,38 @@ +"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD.""" +from udapi.core.block import Block +import logging +import re + +class FixVerbFeats(Block): + + def process_node(self, node): + """ + The features assigned to verbs in Spanish PUD are often wrong, although + the annotation was (reportedly) done manually. For example, infinitives + are tagged with VerbForm=Fin instead of VerbForm=Inf. + """ + if re.match(r'^(VERB|AUX)$', node.upos): + if re.search(r'[aei]r$', node.form, re.IGNORECASE): + # The infinitive has no features other than VerbForm. + node.feats = {} + node.feats['VerbForm'] = 'Inf' + node.lemma = node.form.lower() + elif re.search(r'ndo$', node.form, re.IGNORECASE): + if node.form.lower() != 'entiendo': + # The gerund has no features other than VerbForm. + # The lemma is not always straightforward but we have fixed it manually. + node.feats = {} + node.feats['VerbForm'] = 'Ger' + elif re.search(r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) + gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py new file mode 100644 index 00000000..916910b5 --- /dev/null +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -0,0 +1,87 @@ +""" +Block ud.FixAdvmodByUpos will change the dependency relation from advmod to something else +if the UPOS is not ADV. +""" +from udapi.core.block import Block + + +class FixAdvmodByUpos(Block): + """ + Make sure advmod is not used with UPOS it should not be used with. + """ + + def process_node(self, node): + if node.udeprel == 'advmod': + if node.upos in ['NOUN', 'PROPN', 'PRON', 'DET', 'NUM']: + node.deprel = 'obl' + elif node.upos == 'VERB': + node.deprel = 'advcl' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos in ['ADP', 'SCONJ']: + if node.parent.upos == 'VERB': + node.deprel = 'mark' + else: + node.deprel = 'case' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + else: + node.deprel = 'dep' + ###!!! The following are not advmod so they should probably have their + ###!!! own block or this block should have a different name. 
+ elif node.udeprel == 'expl': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.udeprel in ['aux', 'cop']: + if node.upos != 'AUX': + node.deprel = 'dep' + elif node.udeprel == 'case': + if node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.udeprel == 'mark': + if node.upos in ['PRON', 'DET']: + node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable + elif node.upos == 'NOUN': + node.deprel = 'obl' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'cc': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'det': + if node.upos == 'NOUN': + node.deprel = 'nmod' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'VERB': + node.deprel = 'dep' + elif node.upos == 'SCONJ': + node.deprel = 'mark' + elif node.upos == 'X': + node.deprel = 'dep' + elif node.udeprel == 'nummod': + if node.upos == 'PRON': + node.deprel = 'nmod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.udeprel == 'punct': + if node.upos != 'PUNCT': + node.deprel = 'dep' diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py new file mode 100644 index 00000000..90596e35 --- /dev/null +++ b/udapi/block/ud/fixcompoundname.py @@ -0,0 +1,46 @@ +""" +Block ud.FixCompoundName finds compound relations between PROPN nodes and converts +them to flat:name. This is not necessarily correct in all situations. The difference +between compound and flat is that compound allows to distinguish head and modifier. +Multiword person names (given name and surname, or various other patterns) typically +should be analyzed as flat but there are treebanks that incorrectly use compound +for person names. This block can be used to fix them. +""" +from udapi.core.block import Block +import regex as re +import logging + + +class FixCompoundName(Block): + """ + Converts a compound relation between two PROPN nodes into a flat relation. + Compounds of a PROPN and a non-PROPN will be left alone, although they are + suspicious, too. + """ + + def process_node(self, node): + if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN': + origparent = node.parent + grandparent = origparent.parent + outdeprel = origparent.deprel + # See if there are other PROPN compound siblings. + # (The list node.children is automatically sorted by ord. If any new sorting is needed later, we can compare nodes directly, their default comparison value is ord.) + namewords = [x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)] + # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds. + # This is wrong but it is also different from personal names we are targeting here. + # Hence, we will skip "names" that contain numbers. + if any(re.search(r"\d", x.form) for x in namewords): + #logging.info(str([x.misc['Translit'] for x in namewords])) + ###!!! We currently cannot transform enhanced dependencies. + ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. 
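+ ###!!! Hence the fatal error below when the node has any enhanced dependencies.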
+ if len(node.deps) > 0: + logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') + # The first name word will be the technical head. If it is the current parent, fine. + head = namewords[0] + rest = namewords[1:] + if head != origparent: + head.parent = grandparent + head.deprel = outdeprel + for n in rest: + n.parent = head + n.deprel = 'flat:name' diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py new file mode 100644 index 00000000..9b4ce191 --- /dev/null +++ b/udapi/block/ud/fixleaf.py @@ -0,0 +1,42 @@ +""" +Block ud.FixLeaf checks that function word dependents are leaves. +Certain known exceptions are observed (e.g., fixed expressions). +""" +from udapi.core.block import Block +import logging +import re + +class FixLeaf(Block): + """ + Make sure that function words are leaves unless one of the known exceptions + applies. + """ + + def __init__(self, deprels='aux,cop,case,mark,cc', **kwargs): + """ + Args: + deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel: + # Every function dependent can have a fixed child. + # We will also allow conj, cc, punct, goeswith, reparandum. + allowed = ['fixed', 'punct', 'goeswith', 'reparandum'] + if deprel != 'cc': + allowed += ['conj', 'cc'] + children = [c for c in node.children if not (c.udeprel in allowed)] + # Re-attach the remaining children to an acceptable ancestor. + ancestor = node.parent + while ancestor.udeprel in self.deprels: + ancestor = ancestor.parent + for c in children: + c.parent = ancestor + # If there are enhanced dependencies, check whether we want to redirect them too. + if c.deps: + for edep in c.deps: + if edep['parent'] == node: + edep['parent'] = ancestor diff --git a/udapi/block/ud/fixmultisubjects.py b/udapi/block/ud/fixmultisubjects.py new file mode 100644 index 00000000..f8aeca06 --- /dev/null +++ b/udapi/block/ud/fixmultisubjects.py @@ -0,0 +1,23 @@ +""" +Block ud.FixMultiSubjects will ensure that no node has more than one subject child (except those +marked as :outer). +""" +import re +from udapi.core.block import Block + + +class FixMultiSubjects(Block): + """ + Make sure there is at most one subject that is not marked as :outer. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r"^[nc]subj(:|$)", x.deprel) and not re.search(r":outer$", x.deprel)] + # For the moment, we take the dummiest approach possible: The first subject survives and all others are forced to a different deprel. + if len(subjects) > 1: + subjects = subjects[1:] + for s in subjects: + if re.match(r"^n", s.deprel): + s.deprel = 'obl' + else: + s.deprel = 'advcl' diff --git a/udapi/block/ud/fixmwtspace.py b/udapi/block/ud/fixmwtspace.py new file mode 100644 index 00000000..a2b7b875 --- /dev/null +++ b/udapi/block/ud/fixmwtspace.py @@ -0,0 +1,22 @@ +""" +Block ud.FixMwtSpace looks for multiword tokens whose form contains a space, +which should be avoided. If found, the block checks whether it can remove +the multiword token seamlessly, that is, whether the syntactic words correspond +to the space-delimited parts of the multiword token. If possible, the MWT +line will be removed. 
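+ For example, if the MWT form is "X Y" and its two syntactic words are "X" and "Y", joining the word forms with a space reproduces the MWT form, so the MWT line can be removed.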
+""" +from udapi.core.block import Block +import re + + +class FixMwtSpace(Block): + """Try to remove multiword tokens with spaces.""" + + def process_node(self, node): + if node.multiword_token: + mwt = node.multiword_token + if re.search(r' ', mwt.form): + if node == mwt.words[0]: + wordforms = [x.form for x in mwt.words] + if ' '.join(wordforms) == mwt.form: + mwt.remove() diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py index ab07eaaa..f4d9a1ec 100644 --- a/udapi/block/ud/fixpseudocop.py +++ b/udapi/block/ud/fixpseudocop.py @@ -2,38 +2,44 @@ but they should be treated as normal verbs (with secondary predication) instead.""" from udapi.core.block import Block -import logging import re class FixPseudoCop(Block): - def __init__(self, lemma, **kwargs): + def __init__(self, lemmas, noncopaux=False, **kwargs): """Create the ud.FixPseudoCop block instance. Args: - lemma: the lemma of the pseudocopula that should be fixed + lemmas: comma-separated list of lemmas of the pseudocopulas that should be fixed + noncopaux: do the same for non-copula auxiliaries with the given lemma """ super().__init__(**kwargs) - self.lemma = lemma + self.lemmas = lemmas.split(',') + self.noncopaux = noncopaux def process_node(self, node): - pseudocop = self.lemma - if node.lemma == pseudocop and node.udeprel == "cop": - secpred = node.parent - grandparent = secpred.parent - node.parent = grandparent - node.deprel = secpred.deprel - secpred.parent = node - secpred.deprel = "xcomp" - ###!!! We should also take care of DEPS if they exist. - # As a copula, the word was tagged AUX. Now it should be VERB. - node.upos = "VERB" - # Examine the children of the original parent. - # Those that modify the clause should be re-attached to me. - # Those that modify the word (noun, adjective) should stay there. - for c in secpred.children: - # obl is borderline. It could modify an adjective rather than a clause. - # obj and iobj should not occur in copular clauses but it sometimes - # occurs with pseudocopulas: "I declare him handsome." - if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): - c.parent = node + pseudocop = self.lemmas + if node.lemma in pseudocop: + # Besides spurious copulas, this block can be optionally used to fix spurious auxiliaries (if noncopaux is set). + if node.udeprel == 'cop' or self.noncopaux and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # As a copula, the word was tagged AUX. Now it should be VERB. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + # Another possible error is that the word is tagged AUX without being attached as "cop" or "aux". 
+ elif self.noncopaux and node.upos == 'AUX': + node.upos = 'VERB' diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index cc34a0d0..854a24a8 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -2,7 +2,7 @@ Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct, and is always attached projectively, usually to the head of a neighboring subtree -to its left or right. +to its left or right (see https://universaldependencies.org/u/dep/punct.html). Punctuation normally does not have children. If it does, we will fix it first. This block tries to re-attach punctuation projectively and according to the guidelines. @@ -27,15 +27,15 @@ '{': '}', '"': '"', # ASCII double quotes "'": "'", # ASCII single quotes - '“': '”', # quotation marks used in English,... - '„': '“', # Czech, German, Russian,... - '«': '»', # French, Russian, Spanish,... + '“': '”', # quotation marks used in English, ... + '„': '“', # Czech, German, Russian, ... + '«': '»', # French, Russian, Spanish, ... '‹': '›', # dtto '《': '》', # Korean, Chinese '「': '」', # Chinese, Japanese - '『': '』', # dtto - '¿': '?', # Spanish question quotation marks - '¡': '!', # Spanish exclamation quotation marks + '『': '』', # ditto + '¿': '?', # Spanish paired question marks + '¡': '!', # Spanish paired exclamation marks } FINAL_PUNCT = '.?!' @@ -50,22 +50,34 @@ def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwar Args: check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT. The default is false, which means that fixed punctuation is detected only - based on the form with the exception of single quote / apostrophe character, - which is frequently ambiguous, so UPOS=PUNCT is checked always. - copy_to_enhanced: for all PUNCT nodes, let the enhanced depencies be the same - as the basic dependencies. + based on the form with the exception of single & double quote character, + which is frequently ambiguous*, so UPOS=PUNCT is checked always. + *) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol. + copy_to_enhanced: for all upos=PUNCT, let the enhanced depencies + be the same as the basic dependencies. """ super().__init__(**kwargs) self._punct_type = None self.check_paired_punct_upos = check_paired_punct_upos self.copy_to_enhanced = copy_to_enhanced + def _is_punct(self, node): + if node.upos == 'PUNCT': + return True + if self.check_paired_punct_upos: + return False + if node.form in "'\"": + return False + if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values(): + return True + return False + def process_tree(self, root): # First, make sure no PUNCT has children. # This may introduce multiple subroots, which will be fixed later on # (preventing to temporarily create multiple subroots here would prevent fixing some errors). for node in root.descendants: - while node.parent.upos == "PUNCT": + while self._is_punct(node.parent): node.parent = node.parent.parent # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type. @@ -77,16 +89,15 @@ def process_tree(self, root): self._punct_type = [None] * (1 + len(root.descendants)) for node in root.descendants: if self._punct_type[node.ord] != 'closing': - closing_punct = PAIRED_PUNCT.get(node.form, None) + closing_punct = PAIRED_PUNCT.get(node.form) if closing_punct is not None: self._fix_paired_punct(root, node, closing_punct) # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type). 
for node in root.descendants: - if node.upos == "PUNCT" and not self._punct_type[node.ord]: + if node.upos == 'PUNCT' and not self._punct_type[node.ord]: self._fix_subord_punct(node) - # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot". # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator. # So lets prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children). @@ -100,6 +111,8 @@ def process_tree(self, root): # This may not hold if the original subroot was a paired punctuation, which was rehanged. if root.children[0].udeprel != 'root': root.children[0].udeprel = 'root' + if self.copy_to_enhanced: + root.children[0].deps = [{'parent': root, 'deprel': 'root'}] for another_node in root.children[0].descendants: if another_node.udeprel == 'root': another_node.udeprel = 'punct' @@ -107,8 +120,8 @@ def process_tree(self, root): # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well. if self.copy_to_enhanced: for node in root.descendants: - if node.upos == "PUNCT": - node.deps = [{'parent': node.parent, 'deprel': 'punct'}] + if node.upos == 'PUNCT': + node.deps = [{'parent': node.parent, 'deprel': node.deprel}] def _fix_subord_punct(self, node): # Dot used as the ordinal-number marker (in some languages) or abbreviation marker. @@ -131,12 +144,12 @@ def _fix_subord_punct(self, node): l_cand, r_cand = node.prev_node, node.next_node if node.form in FINAL_PUNCT: r_cand = None - while l_cand.ord > 0 and l_cand.upos == "PUNCT": + while l_cand.ord > 0 and l_cand.upos == 'PUNCT': if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node: l_cand = None break l_cand = l_cand.prev_node - while r_cand is not None and r_cand.upos == "PUNCT": + while r_cand is not None and r_cand.upos == 'PUNCT': if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node: r_cand = None break @@ -149,13 +162,13 @@ def _fix_subord_punct(self, node): if l_cand is None or l_cand.is_root(): l_cand, l_path = None, [] else: - while (not l_cand.parent.is_root() and l_cand.parent.precedes(node) - and not node.precedes(l_cand.descendants(add_self=1)[-1])): + while (not l_cand.parent.is_root() and l_cand.parent < node + and not node < l_cand.descendants(add_self=1)[-1]): l_cand = l_cand.parent l_path.append(l_cand) if r_cand is not None: - while (not r_cand.parent.is_root() and node.precedes(r_cand.parent) - and not r_cand.descendants(add_self=1)[0].precedes(node)): + while (not r_cand.parent.is_root() and node < r_cand.parent + and not r_cand.descendants(add_self=1)[0] < node): r_cand = r_cand.parent r_path.append(r_cand) @@ -193,7 +206,7 @@ def _fix_subord_punct(self, node): # We try to be conservative and keep the parent, unless we are sure it is wrong. 
if node.parent not in path: node.parent = cand - node.deprel = "punct" + node.deprel = 'punct' def _will_be_projective(self, node, cand): node.parent = cand @@ -204,9 +217,8 @@ def _causes_gap(self, node): def _fix_paired_punct(self, root, opening_node, closing_punct): if (self.check_paired_punct_upos - or opening_node.form == "'") and opening_node.upos != 'PUNCT': + or opening_node.form in "'\"") and opening_node.upos != 'PUNCT': return - nested_level = 0 for node in root.descendants[opening_node.ord:]: if node.form == closing_punct: @@ -219,29 +231,57 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): nested_level += 1 def _fix_pair(self, root, opening_node, closing_node): + # Ideally, paired punctuation symbols should be attached to the single + # head of the subtree inside. Provided the inside segment is a single + # subtree. heads = [] punct_heads = [] - for node in root.descendants[opening_node.ord: closing_node.ord - 1]: - if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): - if node.upos == 'PUNCT': - punct_heads.append(node) + for node in root.descendants: + if node == opening_node or node == closing_node: + continue + # If this is a node inside of the pair, is its parent outside? + if node > opening_node and node < closing_node: + if node.parent < opening_node or node.parent > closing_node: + if node.upos == 'PUNCT': + punct_heads.append(node) + else: + heads.append(node) + # Not only the punctuation symbols must not be attached non-projectively, + # they also must not cause non-projectivity of other relations. This could + # happen if an outside node is attached to an inside node. To account for + # this, mark the inside parent as a head, too. + elif node.parent > opening_node and node.parent < closing_node: + if node.parent.upos == 'PUNCT': + punct_heads.append(node.parent) else: - heads.append(node) + heads.append(node.parent) # Punctuation should not have children, but if there is no other head candidate, # let's break this rule. if len(heads) == 0: heads = punct_heads - if len(heads) == 1: + # If there are no nodes between the opening and closing mark (), + # let's treat the marks as any other (non-pair) punctuation. + if len(heads) == 0: + return + else: + # Ideally, there should be only a single head. + # If not, we could try e.g. to choose the "widests-span head": + # opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] + # closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] + # which often leads to selecting the same head for the opening and closing punctuation + # ignoring single words inside the paired punct which are non-projectively attached outside. + # However, this means that the paired punctuation will be attached non-projectively, + # which is forbidden by the UD guidelines. + # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities. + # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of + # inside heads and inside parents of outside nodes). 
+ heads.sort(key=lambda x: x.ord) opening_node.parent = heads[0] - closing_node.parent = heads[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' - elif len(heads) > 1: - opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] - closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' + closing_node.parent = heads[-1] + + self._punct_type[opening_node.ord] = 'opening' + self._punct_type[closing_node.ord] = 'closing' # In rare cases, non-projective gaps may remain. Let's dirty fix these! # E.g. in "the (lack of) reproducibility", the closing parenthesis diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py new file mode 100644 index 00000000..be972d8b --- /dev/null +++ b/udapi/block/ud/fixroot.py @@ -0,0 +1,37 @@ +""" +Block ud.FixRoot will ensure that the tree is free of common root-related errors. +Simple heuristics are used; it is likely that human inspection would lead to +a different solution. Nevertheless, if a quick fix is needed to pass the +validation, this block can be helpful. + +WARNING: The block currently ignores enhanced dependencies. +""" +import re +from udapi.core.block import Block + + +class FixRoot(Block): + """ + Fixes the following validation errors: + - Only one node must be attached directly to the artificial root node. + => If the root has multiple children, keep the first one. Attach the other + ones to the first one. Change their deprel to 'parataxis'. + - The node attached as a child of the artificial root node must have the + 'root' relation (or its subtype). + => If the root child has another deprel, change it to 'root'. + - The node attached as a child of the artificial root node is the only one + allowed to have the 'root' relation (or its subtype). + => If another node has that deprel, change it to 'parataxis'. + """ + + def process_tree(self, root): + rchildren = root.children + if len(rchildren) > 1: + for i in range(len(rchildren)-1): + rchildren[i+1].parent = rchildren[0] + rchildren[i+1].deprel = 'parataxis' + if rchildren[0].udeprel != 'root': + rchildren[0].deprel = 'root' + for n in root.descendants: + if not n.parent == root and n.udeprel == 'root': + n.deprel = 'parataxis' diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py new file mode 100644 index 00000000..004ab4af --- /dev/null +++ b/udapi/block/ud/hi/fixaux.py @@ -0,0 +1,170 @@ +""" +Block to fix annotation of verbs that are currently treated as auxiliaries +but they should be treated as normal verbs instead. +""" +from udapi.core.block import Block +import logging +import re + +class FixAux(Block): + + def process_node(self, node): + self.fix_lemma(node) + # The following verbs appear in verb-verb compounds as the semantically + # less salient element: le (to take), de (to give), ḍāla / phenka (to throw), + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), + # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), + # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), + # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), + # gujara (to pass), ghera (to surround), baca (to escape). + # There are also jā (to go) and paṛa (to fall) but we do not list them here + # because they can also act as genuine auxiliaries. 
+ hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर', 'फूंक', 'घेर', 'बच'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] + recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + # Control and raising verbs. + # चाहना چاہنا (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + # दिखाना دکھانا (dikhānā) “to show” + # बनना بننا (bananā) “to become” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] + rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' + if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + node.deprel = 'compound' + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # वाला والا (vālā) with infinitive is annotated as auxiliary but it should not. + # It is not even a verb (it does not have a verbal paradigm); it is more + # like an adjective morphologically, and like a noun syntactically. It means + # “the one who does the action of the content verb infinitive.” + # Some occurrences in the original annotation are case or mark, so we do not + # check AUX/aux here. + elif node.lemma == 'वाला' or node.lemma == 'والا': + node.upos = 'ADJ' + node.feats['AdpType'] = '' + node.feats['VerbForm'] = '' + node.feats['Aspect'] = '' + node.deprel = 'compound' + elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + + def fix_lemma(self, node): + """ + Some verbal forms have wrong lemmas in the Hindi/Urdu treebanks. If they + are tagged AUX, it means that either the validator fails to recognize a + correct auxiliary, or we fail here to recognize a spurious auxiliary that + must be fixed. + """ + if node.upos == 'AUX': + # آنے is the oblique infinitive form of “to come” + if node.lemma == 'آنہ': + node.lemma = 'آ' + # بنانا बनाना “make, create, produce, cause to be/become” + # (I don't know why in some instances بنا was used as lemma for کر “to do”.) + if node.form == 'کر' and node.lemma == 'بنا': + node.lemma = 'کر' + # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) 
+ if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': + node.lemma = 'چاہئے' + if node.form == 'چاہئیں': + node.lemma = 'چاہئے' + node.feats['Number'] = 'Plur' + # چاہے seems to be a wrong lemma of چاہیں_گے “would like” + if node.lemma == 'چاہے': + node.lemma = 'چاہ' + # चुका چکا is a perfective participle of चुकना چکنا (cuknā) “to be finished” + if node.lemma == 'चुका': + node.lemma = 'चुक' + if node.lemma == 'چکا': + node.lemma = 'چک' + # दिया دیا is a perfective participle of देना دینا (denā) “to give” + if node.lemma == 'दिया': + node.lemma = 'दे' + if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': + node.lemma = 'دے' + # دکھائیں (dikhānā) “to show” + if node.form == 'دکھائیں': + node.lemma = 'دکھا' + # گا, گی, گے denote the future tense. They are written as separate + # words in Urdu (while they are just suffixes in Hindi). However, + # when written as a separate auxiliary, all these forms should share + # the same lemma. + if node.lemma == 'گی' or node.lemma == 'گے': + node.lemma = 'گا' + # گیا is a perfective participle of जाना جانا‎ (jānā) “to go” + # जान جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + if node.lemma == 'जाना' or node.lemma == 'जान': + node.lemma = 'जा' + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': + node.lemma = 'جا' + # Wrongly lemmatized present forms of “to be”. + # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'हों' or node.lemma == 'है.': + node.lemma = 'है' + if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': + node.lemma = 'ہے' + # लिया لیا is a perfective participle of लेना لینا (lenā) “to take” + # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'लिया': + node.lemma = 'ले' + if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': + node.lemma = 'لے' + # लगा لگا is a perfective participle of लगना لگنا (lagnā) “to seem, to appear” + if node.lemma == 'लगा': + node.lemma = 'लग' + if node.lemma == 'لگا': + node.lemma = 'لگ' + # पहुंचा پہنچا is a perfective participle of पहुंचना پہنچنا (pahuñcnā) “to reach” + if node.lemma == 'पहुंचा' or node.lemma == 'पहुँच': + node.lemma = 'पहुंच' + # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” + if node.lemma == 'پڑے': + node.lemma = 'پڑ' + # پھرے is a perfective participle of پھرنا (pharnā) “to return” + if node.lemma == 'پھرے': + node.lemma = 'پھر' + # रहा رہا is a perfective participle of रहना رہنا (rahnā) “to stay” + if node.lemma == 'रहा' or node.lemma == 'रहूं': + node.lemma = 'रह' + if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': + node.lemma = 'رہ' + # sakna to be able to + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': + node.lemma = 'سک' + # Wrongly lemmatized past forms of “to be”. + if node.lemma == 'थी': + node.lemma = 'था' + if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': + node.lemma = 'تھا' + # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up” + if node.lemma == 'उठा': + node.lemma = 'उठ' + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' + # The compound part vālā is not an auxiliary. 
We handle it in process_node() + # but it must be lemmatized properly. + if node.lemma == 'والی': + node.lemma = 'والا' + # The postposition ke after a verbal stem is not an auxiliary. + # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” + if node.lemma == 'کا' and node.form == 'کے': + node.upos = 'ADP' + node.deprel = 'mark' diff --git a/udapi/block/ud/id/__init__.py b/udapi/block/ud/id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py new file mode 100644 index 00000000..a8d50748 --- /dev/null +++ b/udapi/block/ud/id/addmwt.py @@ -0,0 +1,219 @@ +""" +Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with +MorphInd whose output is stored in MISC attribute MorphInd). +""" +import udapi.block.ud.addmwt +import logging +import re + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku
<p>_PS1|kamu<p>
_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB': + splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE) + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON VERB' + if re.search(r'^ku ', splitform.lower()): + lemma = re.sub(r'^ku ', 'aku ', splitform.lower()) + feats = 'Number=Sing|Person=1|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS1 VSA' + else: + lemma = re.sub(r'^kau ', 'kamu ', splitform.lower()) + feats = 'Number=Sing|Person=2|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS2 VSA' + deprel = 'nsubj *' + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia
<p>_PS3|aku<p>_PS1|kamu<p>
_PS2)\$$', node.misc['MorphInd']): + if node.upos == 'VERB': + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # For transitive verbs with the meN- prefix, -nya is an object clitic. + # For passive verbs with the di- prefix, -nya refers to a passive agent. + # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. + # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). + menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False + diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False + nominalization = not menverb and not diverb + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + if nominalization: + lemma = splitform.lower() + upos = 'VERB DET' + feats = '* Definite=Def|PronType=Art' + deprel = '* det' + else: + upos = 'VERB PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + # The agent of the passive verb is coded like a direct object of an active verb, + # so we might want to use obj:agent rather than obl:agent. However, full nominals + # as passive agents can be optionally accompanied by the preposition _oleh_ "by", + # which is an argument in favor of saying that they are oblique. So we currently + # mark all passive agents as obliques, although it is disputable in Austronesian + # languages (unlike Indo-European passives). + deprel = '* obl:agent' if diverb else '* obj' + xpos = re.sub(r'\+', ' ', node.xpos) + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'(NOUN|PROPN|X)', node.upos): + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = '* PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* nmod:poss' + # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE): + # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs' + xpos = 'NSD PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs' + xpos = 'NSD PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs' + xpos = 'NSD PS2' + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): + # nominalized adjective + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'ADJ DET' + feats = '* Definite=Def|PronType=Art' + if re.match(r' ', node.xpos): + xpos = re.sub(r'\+', ' ', node.xpos) + else: + xpos = 'ASP PS3' + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE): + # semua = all (DET) + # semuanya = nominalization of semua, i.e., 'everything' (PRON) + # banyak = many, much (DET) + # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON) + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'DET DET' + feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE): + # satu = one (NUM) + # satunya = nominalization of satu, meaning 'the only one' + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'NUM DET' + feats = 'NumType=Card Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): + # Fused preposition and pronoun. + # Most of them are recognized as R--+PS3 by MorphInd. However, some are different: + # bersamanya = 'with him' = VSA+PS3 + # dibawahnya = 'under it' = VSP+PS3 + # didalamnya = 'inside it' = VSP+PS3 + # sekitarnya = 'around it' = D--+PS3 + # However: + # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + upos = 'ADP PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + xpos = 'R-- PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + xpos = 'R-- PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = 'R-- PS2' + if node.udeprel == 'case': + if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos): + deprel = 'nmod' + else: + deprel = 'obl' + else: + deprel = '*' + deprel = 'case '+deprel + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + else: + # Do not warn about instances that are known exceptions. + # akibatnya = as a result (SCONJ); akibat = result + # bukannya = instead (PART); bukan = no, not + # layaknya = like (ADP); layak = worthy + # sebaiknya = should (AUX) + # sesampainya = once in / arriving at (ADP) + # tidaknya = whether or not (PART); tidak = no, not + # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. + if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): + logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) + return None + elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): + splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = '* PART' + feats = '* _' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split()) < 2: + xpos = xpos + ' T--' + deprel = '* advmod:emph' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + return None + + def postprocess_mwt(self, mwt): + """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" + match = re.match(r'^\^(.*)\+(aku
<p>_PS1|kamu<p>_PS2|dia<p>
_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd']) + if not match: + match = re.match(r'^\^(aku
<p>_PS1|kamu<p>
_PS2)\+(.*)\$$', mwt.misc['MorphInd']) + if match: + mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' + mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py new file mode 100644 index 00000000..d328212d --- /dev/null +++ b/udapi/block/ud/id/fixgsd.py @@ -0,0 +1,447 @@ +"""Block to fix annotation of UD Indonesian-GSD.""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def fix_upos_based_on_morphind(self, node): + """ + Example from data: ("kesamaan"), the correct UPOS is NOUN, as + suggested by MorphInd. + Based on my observation so far, if there is a different UPOS between + the original GSD and MorphInd, it's better to trust MorphInd + I found so many incorrect UPOS in GSD, especially when NOUNs become + VERBs and VERBs become NOUNs. + I suggest adding Voice=Pass when the script decides ke-xxx-an as VERB. + """ + if node.upos == 'VERB' and node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE): + node.upos = 'NOUN' + if node.udeprel == 'acl': + node.deprel = 'nmod' + elif node.udeprel == 'advcl': + node.deprel = 'obl' + + def fix_semua(self, node): + """ + Indonesian "semua" means "everything, all". + Originally it was DET, PRON, or ADV. + Ika: I usually only labeled "semua" as DET only if it's followed by a + NOUN/PROPN. If it's followed by DET (including '-nya' as DET) or it's + not followed by any NOUN/DET, I labeled them as PRON. + """ + if node.form.lower() == 'semua': + if re.match(r'^(NOUN|PROPN)$', node.parent.upos) and node.parent.ord > node.ord: + node.upos = 'DET' + if node.udeprel == 'nmod' or node.udeprel == 'advmod': + node.deprel = 'det' + else: + node.upos = 'PRON' + if node.udeprel == 'det' or node.udeprel == 'advmod': + node.deprel = 'nmod' + node.feats['PronType'] = 'Tot' + + def fix_ordinal_numerals(self, node): + """ + Ordinal numerals should be ADJ NumType=Ord in UD. They have many different + UPOS tags in Indonesian GSD. This method harmonizes them. + pertama = first + kedua = second + ketiga = third + keempat = fourth + kelima = fifth + keenam = sixth + ketujuh = seventh + kedelapan = eighth + kesembilan = ninth + ke-48 = 48th + + However! The ke- forms (i.e., not 'pertama') can also function as total + versions of cardinal numbers ('both', 'all three' etc.). If the numeral + precedes the noun, it is a total cardinal; if it follows the noun, it is + an ordinal. An exception is when the modified noun is 'kali' = 'time'. + Then the numeral is ordinal regardless where it occurs, and together + with 'kali' it functions as an adverbial ordinal ('for the second time'). 
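+ For example, 'buku kedua' means 'the second book' (ordinal, the numeral follows the noun), while 'kedua buku' means 'both books' (total cardinal, the numeral precedes the noun).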
+ """ + # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) + if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE): + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if node.parent.ord < node.ord or node.parent.lemma == 'kali': + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + else: + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['PronType'] = 'Tot' + if re.match(r'^(det|amod|nmod)$', node.udeprel): + node.deprel = 'nummod' + + def rejoin_ordinal_numerals(self, node): + """ + If an ordinal numeral is spelled using digits ('ke-18'), it is often + tokenized as multiple tokens, which is wrong. Fix it. + """ + if node.form.lower() == 'ke': + dash = None + number = None + if node.next_node: + if node.next_node.form == '-': + dash = node.next_node + if dash.next_node and re.match(r'^\d+$', dash.next_node.form): + number = dash.next_node + node.form = node.form + dash.form + number.form + node.lemma = node.lemma + dash.lemma + number.lemma + elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node): + number = node.next_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = node.form + '-' + number.form + node.form = node.form + number.form + node.lemma = node.lemma + '-' + number.lemma + if number: + # Let us pretend that these forms are always ordinal numerals. + # Situations where they act as total cardinals will be disambiguated + # in a subsequent call to fix_ordinal_numerals(). + node.upos = 'ADJ' + node.xpos = 'CO-' + node.feats['NumType'] = 'Ord' + node.misc['MorphInd'] = '^ke_R--+' + number.form + '_CC-$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'amod' + # Adjust SpaceAfter. + node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + + def rejoin_decades(self, node): + """ + In Indonesian, the equivalent of English "1990s" is written as "1990-an". + In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it. 
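+ For example, the three tokens '1990', '-' and 'an' are merged into a single token '1990-an'.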
+ """ + if node.form.lower() == 'an': + dash = None + number = None + if node.prev_node: + if node.prev_node.form == '-': + dash = node.prev_node + if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form): + number = dash.prev_node + node.form = number.form + dash.form + node.form + node.lemma = number.lemma + dash.lemma + node.lemma + elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node): + number = node.prev_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = number.form + '-' + node.form + node.form = number.form + node.form + node.lemma = number.lemma + '-' + node.lemma + if number: + # The combined token is no longer a numeral. It cannot quantify an entity. + # Instead, it is itself something like a noun (or perhaps proper noun). + node.upos = 'NOUN' + node.xpos = 'NSD' + node.feats['NumType'] = '' + # In some cases, "-an" is labeled as foreign for no obvious reason. + node.feats['Foreign'] = '' + node.misc['MorphInd'] = '^' + number.form + '_CC-+an_F--$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'nmod' + # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex. + #node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + + def merge_reduplication(self, node): + """ + Reduplication is a common morphological device in Indonesian. Reduplicated + nouns signal plural but some reduplications also encode emphasis, modification + of meaning etc. In the previous annotation of GSD, reduplication was mostly + analyzed as three tokens, e.g., for plurals, the second copy would be attached + to the first one as compound:plur, and the hyphen would be attached to the + second copy as punct. We want to analyze reduplication as a single token. + Fix it. + """ + # We assume that the previous token is a hyphen and the token before it is the parent. + first = node.parent + root = node.root + # Example of identical reduplication: negara-negara = countries + # Example of reduplication with -an: kopi-kopian = various coffee trees + # Example of reduplication with vowel substitution: bolak-balik = alternating + # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) + # Example of reduplication with se-: sehari-hari = daily (hari = day) + # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. 
+ if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # This is specific to the reduplicated plurals. The rest will be done for any reduplications. + # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. + ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. + ###!!! Some other reduplications have slight modifications on one or the other side. + if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): + first.feats['Number'] = 'Plur' + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()): + first.lemma = first.lemma + '-' + node.lemma + first.upos = node.upos + first.xpos = node.xpos + first.feats = node.feats + first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd']) + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = node.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + first.form = first.form + '-' + node.form + hyph.remove() + node.remove() + first.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([first, second], first.form + second.form, mwtmisc) + else: + first.form = first.form + '-' + node.form + if node.no_space_after: + first.misc['SpaceAfter'] = 'No' + else: + first.misc['SpaceAfter'] = '' + hyph.remove() + node.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()): + prefix = node + stem = first # here it is not the first part at all + hyph = stem.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. 
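+ # (Editor's note) Unlike the branch above, only the lemma and the MorphInd analysis are merged
+ # into the stem here; the stem keeps its own UPOS, XPOS and features.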
+ stem.lemma = prefix.lemma + '-' + stem.lemma + stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd']) + # Neither the hyphen nor the prefix should have children. + # If they do, re-attach the children to the stem. + for c in hyph.children: + c.parent = stem + for c in prefix.children: + c.parent = stem + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = stem.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + stem.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([stem, second], stem.form + second.form, mwtmisc) + else: + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + + def fix_plural_propn(self, node): + """ + It is unlikely that a proper noun will have a plural form in Indonesian. + All examples observed in GSD should actually be tagged as common nouns. + """ + if node.upos == 'PROPN' and node.feats['Number'] == 'Plur': + node.upos = 'NOUN' + node.lemma = node.lemma.lower() + if node.upos == 'PROPN': + node.feats['Number'] = '' + + def fix_satu_satunya(self, node): + """ + 'satu' = 'one' (NUM) + 'satu-satunya' = 'the only' + """ + root = node.root + if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu': + satu0 = node.parent.parent + satu1 = node.parent + nya = node + dash = None + if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-': + dash = satu1.prev_node + satu0.misc['SpaceAfter'] = 'No' + dash.misc['SpaceAfter'] = 'No' + root.text = root.compute_text() + satu1.deprel = 'compound:redup' + nya.parent = satu0 + # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian. + if node.form == 'nya' and node.parent.form.lower() == 'satu': + satu0 = node.parent + nya = node + if satu0.next_node.form == '-': + dash = satu0.next_node + if dash.next_node.form.lower() == 'satu': + satu1 = dash.next_node + if satu1.ord == node.ord-1: + # Merge satu0 + dash + satu1 into one node. + satu0.form = satu0.form + dash.form + satu1.form + dash.remove() + satu1.remove() + # There should be a multi-word token comprising satu1 + nya. + mwt = nya.multiword_token + if mwt: + mwtmisc = mwt.misc.copy() + mwt.remove() + mwt = root.create_multiword_token([satu0, nya], satu0.form + nya.form, mwtmisc) + satu0.misc['SpaceAfter'] = '' + root.text = root.compute_text() + if node.multiword_token and node.no_space_after: + node.misc['SpaceAfter'] = '' + + def lemmatize_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. 
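+ # Schematically (editor's illustration, format assumed from the regexes below), an analysis
+ # looks like '^meN+ajar<v>_VSA$': '+'-joined prefixes/suffixes, a stem with a category tag
+ # such as '<v>', and a final XPOS tag; the code strips everything except the bare stem.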
+ # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_V[SP][AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. + if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. 
+ if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + + def process_node(self, node): + self.fix_plural_propn(node) + self.fix_upos_based_on_morphind(node) + self.fix_semua(node) + self.rejoin_ordinal_numerals(node) + self.fix_ordinal_numerals(node) + self.rejoin_decades(node) + self.merge_reduplication(node) + self.fix_satu_satunya(node) + self.lemmatize_from_morphind(node) diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py new file mode 100644 index 00000000..43d2b30d --- /dev/null +++ b/udapi/block/ud/jointoken.py @@ -0,0 +1,97 @@ +""" +Block ud.JoinToken will join a given token with the preceding one. +""" +from udapi.core.block import Block +import logging + + +class JoinToken(Block): + """ + Merge two tokens into one. A MISC attribute is used to mark the tokens that + should join the preceding token. (The attribute may have been set by an + annotator or by a previous block that tests the specific conditions under + which joining is desired.) Joining cannot be done across sentence + boundaries; if necessary, apply util.JoinSentence first. Multiword tokens + are currently not supported: None of the nodes to be merged can belong to + a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.) + Merging is simple if there is no space between the tokens (see SpaceAfter=No + at the first token). If there is a space, there are three options in theory: + + 1. Keep the tokens as two nodes but apply the UD goeswith relation + (see https://universaldependencies.org/u/overview/typos.html) and + the related annotation rules. + 2. Join them into one token that contains a space. Such "words with + spaces" can be exceptionally allowed in UD if they are registered + in the given language. + 3. Remove the space without any trace. Not recommended in UD unless the + underlying text was created directly for UD and can be thus considered + part of the annotation. + + At present, this block does not support merging with spaces at all, but + in the future one or more of the options may be added. + """ + + def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the joining + default: JoinToken + misc_value: value of the MISC attribute to trigger the joining; + if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + self.misc_value = misc_value + + def process_node(self, node): + """ + The JoinToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be merged with the previous node and the + attribute will be removed from MISC, or a warning will be issued that + the merging cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. 
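+ A typical trigger (editor's sketch) is MISC 'JoinToken=Yes' on the second token while the
+ first token carries SpaceAfter=No; the block can then be run e.g. as
+ 'cat in.conllu | udapy -s ud.JoinToken > out.conllu'.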
+ """ + if node.misc[self.misc_name] == '': + return + if self.misc_value and node.misc[self.misc_name] != self.misc_value: + return + prevnode = node.prev_node + if not prevnode: + logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if node.multiword_token or prevnode.multiword_token: + logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if prevnode.misc['SpaceAfter'] != 'No': + logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name) + node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if prevnode.deps or node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # If the first token depends on the second token, re-attach it to the + # second token's parent to prevent cycles. + if prevnode in node.descendants: + prevnode.parent = node.parent + prevnode.deprel = node.deprel + # Re-attach all children of the second token to the first token. + for c in node.children: + c.parent = prevnode + # Concatenate the word forms of the two tokens. Assume that morphological + # annotation, including the lemma, is already updated accordingly (we + # cannot guess it anyway). + prevnode.form += node.form + # Remove SpaceAfter=No from the first token unless the second token has + # this attribute, too (meaning that there is no space between the second + # token and whatever comes next). + prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter'] + # Remove the current node. The joining instruction was in its MISC, so + # it will disappear together with the node. + node.remove() diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py new file mode 100644 index 00000000..044ff178 --- /dev/null +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -0,0 +1,27 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Kazakh.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. + """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # баста = start + if re.match(r'^(баста|кет)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' diff --git a/udapi/block/ud/la/addmwt.py b/udapi/block/ud/la/addmwt.py new file mode 100644 index 00000000..27831151 --- /dev/null +++ b/udapi/block/ud/la/addmwt.py @@ -0,0 +1,41 @@ +""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. 
""" +import udapi.block.ud.addmwt + +MWTS = { + 'mecum': {'lemma': 'ego cum', 'form': 'me cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'tecum': {'lemma': 'tu cum', 'form': 'te cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'nobiscum': {'lemma': 'nos cum', 'form': 'nobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'vobiscum': {'lemma': 'vos cum', 'form': 'vobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'uobiscum': {'lemma': 'uos cum', 'form': 'uobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'secum': {'lemma': 'sui cum', 'form': 'se cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, # can be singular or plural + 'nonne': {'lemma': 'non ne', 'form': 'non ne', 'upos': 'PART PART', 'feats': 'Polarity=Neg Clitic=Yes|PartType=Int', 'deprel': 'advmod:neg discourse', 'shape': 'sibling'} +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + # v['xpos'] = '' # treebank-specific + if 'shape' not in v: + v['shape'] = 'subtree' + v['main'] = 0 + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + if node.form.endswith('.') and len(node.form) > 1 and node.form != '...': + # currently under discussion + return {'form': node.form[:-1] + ' .', + 'lemma': '* .', + 'upos': '* PUNCT', + 'xpos': '_ _', + 'feats': '* _', + 'deprel': '* punct', + 'main': 0, + 'shape': 'subtree'} + diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..a7b506e8 --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,338 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. + + Args: + flavio=1: Accept features as defined by Flavio for treebanks he + maintains. By default, a more conservative set of features and + values is expected. 
+ """ + super().__init__(**kwargs) + self.flavio = flavio + + def process_node(self, node): + rf = [] + af = {} + # PROIEL-specific: greek words without features + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: + pass + # NOUNS ################################################################ + elif node.upos == 'NOUN': + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns + rf = ['Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Dim'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'VerbForm': ['Part', 'Vnoun']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns + rf = ['Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + rf = ['Gender', 'Number', 'Case'] + af = { + 'NumType': ['Dist', 'Mult', 'Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Sup', 'Abs'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] + af['Degree'].append('Dim') + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': # seipsum, se + rf.extend(['Person']) + # seipsum has gender and number but se does not, so it is not required + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Person'] = ['3'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + else: # not reflexive: ego, tu, is, nos + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 3rd person must have gender + if node.feats['Person'] == '3': # is, id + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + elif re.match(r'^(Rel|Int)$', node.feats['PronType']): + rf.extend(['Gender', 'Number']) + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + elif node.feats['PronType'] == 'Ind': + rf = [f for f in rf if f != 'Case'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + if node.lemma in ['qui', 'quicumque', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: + af['PronType'].append('Int') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] + af['Compound'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + rf = ['PronType'] + if node.feats['Case']: + rf.extend(['Gender', 'Number', 'Case']) + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Abs', 'Sup'], + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] + } + if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' + rf.extend(['Poss', 'Person[psor]']) + af['PronType'] = ['Prs'] + af['Poss'] = 'Yes' + af['Person[psor]'] = ['1', '2', '3'] + af['Reflex'] = ['Yes'] + # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus). 
+ if node.feats['Person[psor]'] != '3': + rf.append('Number[psor]') + af['Number[psor]'] = ['Sing', 'Plur'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: + af['PronType'].append('Con') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['NumType'] = ['Card'] + af['Degree'].append('Dim') + af['PronType'].append('Art') + if re.match(r'^(unus|ambo)', node.lemma): + af['NumValue'] = ['1', '2'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + rf = ['NumType', 'NumForm'] + af = { + 'NumType': ['Card', 'Ord'], + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} + # Arabic digits and Roman numerals do not have inflection features. + if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. # e.g. 
duodecim + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + rf = ['VerbForm', 'Aspect'] + af = { + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], + 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], + 'Polarity': ['Neg'], + 'Typo': ['Yes'] + } + if node.feats['VerbForm'] not in ['Part', 'Conv']: + rf.append('Tense') + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): + rf.append('Voice') + af['Voice'] = ['Act', 'Pass'] + if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive + rf.extend(['Mood', 'Person', 'Number']) + af['Mood'] = ['Ind', 'Sub', 'Imp'] + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + elif node.feats['VerbForm'] == 'Part': + rf.extend(['Gender', 'Number', 'Case']) + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + af['Degree'] = ['Abs', 'Cmp'] + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] + if 'Degree' in af: + af['Degree'].append('Dim') + else: + af['Degree'] = ['Dim'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + af = { + 'AdvType': ['Loc', 'Tim'], + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum + 'Polarity': ['Neg'] + } + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin', 'Part'] + af['Degree'].append('Dim') + self.check_allowed_features(node, af) + # PARTICLES ############################################################ + elif node.upos == 'PART': + af = { + 'PartType': ['Int', 'Emp'], + 'Polarity': ['Neg'] + } + if self.flavio: + af['Form'] = ['Emp'] + af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] + self.check_allowed_features(node, af) + # CONJUNCTIONS ######################################################### + elif re.match(r'^[CS]CONJ$', node.upos): + af = { + 'PronType': ['Rel', 'Con'], + 'Polarity': ['Neg'], + 'Compound': ['Yes']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin'] + af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] + self.check_allowed_features(node, af) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } + if self.flavio: + af['VerbForm'] = ['Part'] + af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] + self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py new file mode 100644 index 00000000..a234256f --- /dev/null +++ b/udapi/block/ud/lemmatize.py @@ -0,0 +1,42 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def process_node(self, node): + """ + Some treebanks lack lemmas for some or all words. Occasionally we may be + able to guess that the lemma is identical to the word form. This block + will then fill out the lemma. + + For some parts of speech, we can only say that the form is the lemma if + we have morphological features that will confirm it is the right form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + # Many closed classes do not inflect and have the same lemma as the form (just lowercased). 
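+ # For instance, a CCONJ with form 'But' and an empty lemma gets lemma 'but', while an
+ # inflected NOUN with Number=Plur is left without a lemma because its form is not the lemma.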
+ if re.match(r'^(PUNCT|SYM|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + node.lemma = node.form.lower() + # NOUN PROPN ADJ PRON DET NUM VERB AUX ADV + # ADV: use positive affirmative + elif re.match(r'^(ADV)$', node.upos) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + # VERB and AUX: use the infinitive + elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf' and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN) + # Note: This rule is wrong in German, where no nouns should be lowercased. + elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form + # ADJ: use masculine singular nominative positive affirmative + elif re.match(r'^(ADJ)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma) + elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() + # NUM: use masculine nominative (number, if present at all, is lexical) + elif re.match(r'^(NUM)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() diff --git a/udapi/block/ud/lt/fixedeprels.py b/udapi/block/ud/lt/fixedeprels.py new file mode 100644 index 00000000..9b1cb98d --- /dev/null +++ b/udapi/block/ud/lt/fixedeprels.py @@ -0,0 +1,144 @@ +"""Block to fix case-enhanced dependency relations in Lithuanian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'kaip': [], + 'lyg': [], + 'negu': [], + 'nei': [], + 'nes': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
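+ # (Editor's note) Example of the mapping below: any case-marked variant such as 'obl:su:gen'
+ # or 'obl:su:dat' is normalized to 'obl:su:ins', because 'su' ("with") takes the instrumental.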
+ unambiguous = { + 'apie': 'apie:acc', # about (topic) + 'dėl': 'dėl:gen', # because of + 'iki': 'iki:gen', # until + 'iš': 'iš:gen', # from, out of + 'į': 'į:acc', # to, into, in + 'jei': 'jei', # remove morphological case # if + 'jeigu': 'jeigu', # remove morphological case # if + 'jog': 'jog', # remove morphological case # because + 'kadangi': 'kadangi', # remove morphological case # since, because + 'kai': 'kai', # remove morphological case # when + 'kaip': 'kaip', # remove morphological case # as, than + 'lyg': 'lyg', # remove morphological case # like + 'negu': 'negu', # remove morphological case # than + 'nei': 'nei', # remove morphological case # more than + 'nes': 'nes', # remove morphological case # because + 'nors': 'nors', # remove morphological case # though, although, when, if + 'nuo': 'nuo:gen', # from + 'pagal': 'pagal:acc', # according to, under, by + 'pagal_dėl': 'pagal:acc', + 'per': 'per:acc', # through, over (přes) + 'prie': 'prie:gen', # to, at, near, under + 'prieš': 'prieš:acc', # against + 'su': 'su:ins', # with + 'tarp': 'tarp:gen', # between + 'tarsi': 'tarsi', # remove morphological case # as if + 'virš': 'virš:gen' # above + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. 
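+ # E.g. 'nmod:apie:loc' becomes 'nmod:apie:acc', and 'advcl:jei:gen' becomes just 'advcl:jei'
+ # because the mapping for 'jei' carries no morphological case.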
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. Exclude 'nom' and 'voc', which cannot + # be correct. + m = re.match(r'^(obl(?::arg)?|nmod):(po|už)(?::(?:nom|voc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase and not re.search(r':(nom|voc)$', adpcase): + edep['deprel'] = m.group(1)+':'+adpcase + continue + # The remaining instance of 'po' should be ':acc'. + elif m.group(2) == 'po': + edep['deprel'] = m.group(1)+':po:acc' + continue + # The remaining 'už' are ':acc' (they are second conjuncts + # in coordinated oblique modifiers). + elif m.group(2) == 'už': + edep['deprel'] = m.group(1)+':už:acc' + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index cbd57eef..959b484e 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages. - if i_feat != "VerbForm" or not node.root.zone.split("_")[0] in {"id", "tl", "hil", "ifb"}: + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb'}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': @@ -127,22 +127,22 @@ def process_node(self, node): if not feats['Mood']: self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing') - if feats['Degree'] and upos not in ('ADJ', 'ADV'): - self.log(node, 'degree-upos', - 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos)) - - subject_children = [n for n in node.children if 'subj' in n.udeprel] + subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer'] if len(subject_children) > 1: - self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child') - - object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')] + self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child') + + # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2, + # one may conclude that "obj" and "ccomp" are mutually exclusive. + # However, this has always be a gray zone and people have occasionally + # brought up examples where they would want the two relations to co-occur. 
+ # Also, there is no clausal counterpart for "iobj", which may cause some + # of the problems. It is probably safer not to consider "ccomp" in this + # test. Nevertheless, two "obj" under the same parent are definitely an + # error. + object_children = [n for n in node.children if n.udeprel == 'obj'] if len(object_children) > 1: self.log(node, 'multi-obj', 'More than one obj|ccomp child') - # In addition to http://universaldependencies.org/svalidation.html - if parent.udeprel == 'punct': - self.log(node, 'punct-child', 'parent.deprel=punct') - # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -154,7 +154,7 @@ def process_node(self, node): # so there should be no false alarms. Some errors are not reported, i.e. the cases # when advmod incorrectly depends on a function word ("right before midnight"). if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'): - if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'): + if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'): self.log(node, parent.deprel + '-child', 'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel) @@ -184,14 +184,6 @@ def process_node(self, node): if upos == 'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap(): self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity') - # http://universaldependencies.org/u/dep/cc.html says - # "cc is the relation between a conjunct and a preceding - # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." - # No other upos is allowed in the documentation, although e.g. PART is common in the data. - # There are clear cases of adverbs in role of cc (e.g. "respektive" in Swedish and Czech). - if udeprel == 'cc' and upos not in ('CCONJ', 'ADV'): - self.log(node, 'cc-upos', "deprel=cc upos!=CCONJ (but %s): " % upos) - if udeprel == 'cop': lemma = node.lemma if node.lemma != '_' else form self.cop_nodes[lemma].append(node) diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py new file mode 100644 index 00000000..26c5624d --- /dev/null +++ b/udapi/block/ud/markfeatsbugs.py @@ -0,0 +1,73 @@ +""" +Block to identify missing or ill-valued features in a treebank. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. This is a base block that only +implements service methods. A language-specific block must be derived from this +one and define the actual rules valid in that language. + +Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +""" +from udapi.core.block import Block + +class MarkFeatsBugs(Block): + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. 
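+ # E.g. with allowed={'Gender': ['Masc', 'Fem', 'Neut']}, a node with Gender=Com is flagged as
+ # Bug=FeatGenderValueComNotAllowed, and a node with Animacy=Anim as Bug=FeatAnimacyNotAllowed.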
+ for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. + """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + """ + This is a generic block, do nothing here. In a language-specific block + based on this one, rules similar to the examples below can be specified: + + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + #... + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + """ + return diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py new file mode 100644 index 00000000..13c8434c --- /dev/null +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -0,0 +1,279 @@ +""" +Block to identify missing or ill-valued features in Malayalam. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. 
+ if node.feats['Foreign'] == 'Yes': + pass + # NOUNS AND PROPER NOUNS ############################################### + elif re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'NumType': ['Ord'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': + rf = ['PronType'] + else: # not reflexive + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] + if node.feats['Number'] == 'Sing': + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + # Interrogative pronouns, too, can be case-marked. Therefore, the + # base form must have Case=Nom. + # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan) + # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) + # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" + # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) 
+ #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) + self.check_allowed_features(node, { + 'PronType': ['Art'], + 'Definite': ['Ind'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['PronType']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Frac'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + # Unlike other forms, the imperative distinguishes politeness. 
+ # The verb stem serves as an informal imperative: തുറ tuṟa "open" + # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" + # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" + self.check_required_features(node, ['Mood', 'Polite']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Polite': ['Infm', 'Form'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Pot', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. 
+ #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes']}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {'Abbr': ['Yes'], 'Typo': ['Yes']}) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py new file mode 100644 index 00000000..bd63ee7d --- /dev/null +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -0,0 +1,94 @@ +""" +Block ud.mr.AddFormsInMwt looks for multiword tokens whose words lack forms. +Based on the form of the surface token and on the information provided in +the lemmas and UPOS, tries to reconstruct the forms of individual words. +""" +from udapi.core.block import Block +import re +import logging + + +class AddFormsInMwt(Block): + """Guess forms of syntactic worms within a multiword token.""" + + def process_node(self, node): + if node.form == '_' and node.multiword_token: + mwt = node.multiword_token + # Many multiword tokens consist of NOUN + ADP. Beware: The adposition + # may have a form different from its lemma. It happens with possessive + # postpositions चा, चे, which distinguish the gender and number of + # the possessed entity. + if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): + # Occasionally the lemma of the possessive postposition is mistakenly 'ची' instead of 'चा'. + if mwt.words[1].lemma == 'चा' or mwt.words[1].lemma == 'ची': + mwt.words[1].lemma = 'चा' + # चा (cā) ... Masc Sing + # ची (cī) ... Fem Sing, Neut Plur + # चे (ce) ... Neut Sing, Masc Plur + # च्या (cyā) ... Fem Plur + # चं (caṁ) ... ? + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)$', mwt.form) + # The resulting form is different with personal pronouns. 
+ # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) + # तुझी (tujhī), तुझे (tujhe) + # आपला (āpalā), आपली (āpalī), आपल्या (āpalyā) + # त्याचं (tyācaṁ) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = m.group(2) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + else: + node.form = 'च' + m2.group(2) + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif mwt.words[1].lemma == 'वरती': + m = re.match(r'^(.+)(वर(?:ती)?)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = m.group(2) + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'चा' + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): + # Compound postpositions where the middle word is the possessive 'चा'. + # The lemma of the middle word should be 'चा' but sometimes it is 'च्या'. + if re.match(r'^(चा|च्या)$', mwt.words[1].lemma): + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + elif node == mwt.words[1]: + node.form = m.group(2) + node.lemma = 'चा' + else: + node.form = m.group(3) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + elif node == mwt.words[1]: + node.form = 'च' + m2.group(2) + node.lemma = 'चा' + else: + node.form = m2.group(3) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) diff --git a/udapi/block/ud/printfixed.py b/udapi/block/ud/printfixed.py new file mode 100644 index 00000000..313943bb --- /dev/null +++ b/udapi/block/ud/printfixed.py @@ -0,0 +1,104 @@ +""" +Block PrintFixed prints occurrences of fixed multiword expressions in UD. It +can be run twice in a row, first collecting known fixed expressions and then +also reporting other occurrences of these expressions where they are not +annotated as fixed. + +Usage: +udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt +udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class PrintFixed(Block): + """ + Print fixed multiword expressions. 
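+
+ With only_forms=1, each occurrence is printed as its space-separated,
+ lowercased word forms only. Otherwise the line has the shape
+ "FORMS / UPOS-TAGS / DEPREL-OF-THE-HEAD", or "FORMS / UPOS-TAGS / NOT FIXED"
+ for a known expression that is not annotated as fixed; tokens inside the
+ span that do not belong to the expression are rendered as 'X'. A purely
+ hypothetical output line could look like "as well as / ADV ADV ADP / cc".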
+ """ + + def __init__(self, only_forms=False, known_expressions=None, **kwargs): + """ + Create the PrintFixed block. + + Parameters: + only_forms=1: print the word forms but not tags and other info; + This can be used to create the list of known forms that we want to + identify even if they are not annotated as fixed. + known_expressions: the name of the text file with the expressions + """ + super().__init__(**kwargs) + self.only_forms = only_forms + self.known_expressions = {} + self.first_words = {} + self.max_length = 2 + if known_expressions: + fh = open(known_expressions, 'r', encoding='utf-8') + n = 0 + for expression in fh.readlines(): + expression = expression.replace('\n', '') + if expression in self.known_expressions: + self.known_expressions[expression] += 1 + else: + self.known_expressions[expression] = 1 + logging.info("Read known fixed expression '%s'" % expression) + n += 1 + words = expression.split(' ') + first_word = words[0] + self.first_words[first_word] = 1 + length = len(words) + if length > self.max_length: + self.max_length = length + logging.info('Read %d known fixed expressions.' % n) + + def process_node(self, node): + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + if len(fixed_children) > 0: + # Fixed children are always to the right of of the parent. But there + # may be other nodes in between that are not fixed children (for + # example, there may be punctuation that is attached to one of the + # fixed nodes). + n = node + list_of_forms = [node.form.lower()] + list_of_tags = [node.upos] + while n != fixed_children[-1]: + n = n.next_node + if n.parent == node and n.udeprel == 'fixed': + list_of_forms.append(n.form.lower()) + list_of_tags.append(n.upos) + else: + list_of_forms.append('X') + list_of_tags.append('X') + forms = ' '.join(list_of_forms) + tags = ' '.join(list_of_tags) + if self.only_forms: + print(forms) + else: + print("%s / %s / %s" % (forms, tags, node.deprel)) + else: + # If this is not the first word of a fixed expression, check whether + # something that looks like a known fixed expression starts here. + # Note that it is also possible that a known expression starts here + # but only a subset is actually marked as such; we currently do not + # account for this. + if node.form.lower() in self.first_words: + n = node + list_of_forms = [node.form.lower()] + list_of_tags = [node.upos] + for i in range(self.max_length - 1): + n = n.next_node + if not n: + break + ###!!! At present we cannot identify known expressions with gaps ('X'). + list_of_forms.append(n.form.lower()) + list_of_tags.append(n.upos) + forms = ' '.join(list_of_forms) + if forms in self.known_expressions: + if self.only_forms: + print(forms) + else: + tags = ' '.join(list_of_tags) + print("%s / %s / NOT FIXED" % (forms, tags)) + break diff --git a/udapi/block/ud/pt/addhyphenmwt.py b/udapi/block/ud/pt/addhyphenmwt.py new file mode 100644 index 00000000..9492b1a2 --- /dev/null +++ b/udapi/block/ud/pt/addhyphenmwt.py @@ -0,0 +1,37 @@ +"""Block ud.pt.AddHyphenMwt for transforming hyphen compounds into multiword tokens in Portuguese-GSD. + +See https://github.com/UniversalDependencies/UD_Portuguese-GSD/issues/39 +""" +from udapi.core.block import Block + +class AddHyphenMwt(Block): + + def _ok(self, token): + # The hyphen in "al-Assad" perhaps should be kept as a separate word. 
+ return token.form.isalnum() and token.form.lower() != 'al' + + def process_tree(self, root): + tokens, i = root.token_descendants, 1 + while i+1 < len(tokens): + start_i = i-1 + if tokens[i].form == "-" and self._ok(tokens[i-1]) and self._ok(tokens[i+1]): + while i+3 < len(tokens) and tokens[i+2].form == "-" and self._ok(tokens[i+3]): + i += 2 + compound, words = tokens[start_i:i+2], [] + for token in compound: + words += token.words + heads = [w for w in words if w.parent not in words] + cuckolds = [w for w in words if w not in heads and any(c not in words for c in w.children)] + if len(heads) > 1: + for h in heads: + h.misc["ToDo"] = 'NonCatenaCompound' + elif cuckolds: + for c in cuckolds: + c.misc["ToDo"] = 'HasChildrenOutsideCompound' + else: + compound_form = "".join(t.form for t in compound) + for hyphen in compound[1::2]: + hyphen.remove() + root.create_multiword_token([w for w in words if w.form != '-'], compound_form) + root.text = None + i += 1 diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py new file mode 100644 index 00000000..14d16464 --- /dev/null +++ b/udapi/block/ud/ro/fixfixed.py @@ -0,0 +1,20 @@ +"""Block ud.ro.FixFixed + +Author: Dan Zeman +""" +import logging + +from udapi.core.block import Block + + +class FixFixed(Block): + """Block for fixing annotation of some 'fixed' expressions.""" + + def process_node(self, node): + fixchildren = [x for x in node.children if x.udeprel=='fixed'] + nfc = len(fixchildren) + if nfc > 0: + if node.udeprel == 'advmod' and node.feats['ExtPos'] == '': + node.feats['ExtPos'] = 'ADV' + elif node.feats['ExtPos'] == '': + logging.info('Another case: '+node.lemma+' '+' '.join([x.form for x in fixchildren])) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py new file mode 100644 index 00000000..6fa73460 --- /dev/null +++ b/udapi/block/ud/ru/fixedeprels.py @@ -0,0 +1,279 @@ +"""Block to fix case-enhanced dependency relations in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'как' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('как_в:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'ежели': [], + 'если': [], + 'как': ['как_только'], + 'когда': [], + 'кроме_как': [], + 'менее_чем': [], + 'минус': [], + 'нежели': [], + 'плюс': [], + 'пока': [], + 'поскольку': [], + 'потому_что': [], + 'пусть': [], + 'равно_как': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'хотя': [], + 'чем': [], + 'что': [], + 'чтобы': [], + 'яко': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
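+ # Each key is the lemma-based marker as it may appear in a raw enhanced
+ # deprel; the value is the normalized marker plus its required morphological
+ # case. As an illustration, an edeprel like 'nmod:в_течение:loc' would be
+ # rewritten by process_node() below to 'nmod:в_течение:gen'.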
+ unambiguous = { + 'versus': 'версус:nom', + 'loc': 'в:loc', + 'в_вид': 'в_виде:gen', + 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' + 'в_для': 'в:acc', + 'в_качество': 'в_качестве:gen', + 'в_отношение': 'в_отношении:gen', + 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level + 'в_связь_с': 'в_связи_с:ins', + 'в_случай_если': 'в_случае_если', + 'в_случай_когда': 'в_случае_когда', + 'в_соответствие_с': 'в_соответствии_с:ins', + 'в_течение': 'в_течение:gen', + 'в_то_быть': 'в:loc', + 'в_тот_время_как': 'в_то_время_как', + 'в_угода': 'в_угоду:dat', + 'в_ход': 'в_ходе:gen', + 'вблизи': 'вблизи:gen', + 'взамен': 'взамен:gen', + 'вместо': 'вместо:gen', + 'во_глава': 'во_главе_с:ins', + 'во_глава_с': 'во_главе_с:ins', + 'во_избежание': 'во_избежание:gen', + 'возле': 'возле:gen', + 'вокруг': 'вокруг:gen', + 'вплоть_до': 'вплоть_до:gen', + 'вроде': 'вроде:gen', + 'выше': 'выше:gen', + 'для': 'для:gen', + 'для_в': 'для:gen', + 'до_то_как': 'до:gen', # до того, как ... + 'за_исключение': 'за_исключением:gen', + 'из_более_чем': 'из:gen', + 'к': 'к:dat', + 'ко': 'ко:dat', + 'коли_скоро': 'коль_скоро', + 'кроме': 'кроме:gen', + 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' + 'на_вперед': 'на:acc', + 'над': 'над:ins', # at least I have not encountered any genuine example of accusative + 'насчет': 'насчет:gen', + 'несмотря_на': 'несмотря_на:acc', + 'ниже': 'ниже:gen', + 'около': 'около:gen', + 'от_до': 'от:gen', + 'от_от': 'от:gen', + 'от_с': 'от:gen', + 'относительно': 'относительно:gen', + 'перед': 'перед:ins', + 'по_мера': 'по_мере:gen', + 'по_мера_то_как': 'по_мере_того_как', + 'по_отношение_ко?': 'по_отношению_к:dat', + 'по_повод': 'по_поводу:gen', + 'по_сравнение_с': 'по_сравнению_с:ins', + 'помимо': 'помимо:gen', + 'порядка': 'порядка:gen', + 'после': 'после:gen', + 'посредством_как': 'посредством:gen', + 'при': 'при:loc', + 'при_помощь': 'при_помощи:gen', + 'при_условие_что': 'при_условии_что', + 'про': 'про:acc', + 'против': 'против:gen', + 'с_более_чем': 'с:gen', + 'с_во_глава': 'с:ins', + 'с_на': 'с:par', + 'с_помощь': 'с_помощью:gen', + 'с_тем': 'с:ins', + 'с_тот_пора_как': 'с_тех_пор_как', + 'с_что': 'с:ins', + 'свыше': 'свыше:gen', + 'со_сторона': 'со_стороны:gen', + 'согласно': 'согласно:dat', + 'спустя': 'спустя:acc', + 'среди': 'среди:gen', + 'среди_в': 'среди:gen', + 'так_чтобы': 'чтобы', + 'тем_между': 'между:ins', + 'у': 'у:gen', + 'у_без': 'у:gen', + 'через': 'через:acc', + 'чтоб': 'чтобы' + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Russian basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. 
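+
+ The overall strategy (sketched here for orientation): markers that should
+ not act as case markers are dropped, secondary prepositions are normalized
+ through the 'unambiguous' table, and for prepositions governing several
+ cases the Case feature of the adposition child is copied if available,
+ otherwise a default is picked (e.g. a bare 'obl:до' would become
+ 'obl:до:gen'; the example is illustrative).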
+ """ + for edep in node.deps: + # Although in theory allowed by the EUD guidelines, Russian does not enhance the ccomp relation with case markers. + edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. + edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) + # Some markers should be discarded only if they occur as clause markers (acl, advcl). + edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) + # If the case marker starts with 'столько', remove this part. + # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. + # Similarly, 'то' occurs in 'то...то' and should be removed. + edep['deprel'] = re.sub(r':(столько|то|точно)[_:]', ':', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|par|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. + m = re.match(r'^(obl(?::arg)?|nmod):(до|из|от)(?::(?:nom|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or partitive are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue + # Both "на" and "в" also occur with genitive. However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|во|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Accusative or locative are possible. Pick locative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + continue + # Unlike in Czech, 'над' seems to allow only instrumental and not accusative. 
+ m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Accusative or instrumental are possible. Pick accusative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(между)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Dative, accusative or locative are possible. Pick dative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':dat' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick instrumental. + edep['deprel'] = m.group(1)+':'+m.group(2)+':ins' + continue + if re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'nmod:loc': + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:voc': + edep['deprel'] = 'nmod:nom' + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/ru/fixtoest.py b/udapi/block/ud/ru/fixtoest.py new file mode 100644 index 00000000..1b603e96 --- /dev/null +++ b/udapi/block/ud/ru/fixtoest.py @@ -0,0 +1,35 @@ +"""Block to fix annotation of то есть in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixToEst(Block): + + def process_node(self, node): + """ + In the converted data from Kira, the fixed expression "то есть" ("that is") + is treated as a subordinator and attached as "mark", which later makes it + part of complex enhanced relation labels. I believe that this analysis is + wrong and that it will be better to label these expressions as "cc". 
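+
+ An illustrative configuration (not a quote from the data): if 'то' is
+ attached as mark and has a fixed child whose lemma is 'быть' (surface form
+ 'есть'), its relation is relabeled to cc both in the basic tree and in the
+ enhanced graph, while its parent is kept unchanged.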
+ """ + if node.udeprel == 'mark' and node.lemma == 'то': + if len([c for c in node.children if c.udeprel == 'fixed' and c.lemma == 'быть']) > 0: + self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc') + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index c5321221..ec7ab658 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,10 @@ class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" def process_tree(self, root): + # Empty nodes cannot have 'SpaceAfter=No', so make sure the file is valid. + for empty_node in root.empty_nodes: + del empty_node.misc['SpaceAfter'] + text = root.text if text is None: raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root) diff --git a/udapi/block/ud/settranslation.py b/udapi/block/ud/settranslation.py new file mode 100644 index 00000000..487cca06 --- /dev/null +++ b/udapi/block/ud/settranslation.py @@ -0,0 +1,59 @@ +""" +Block SetTranslation for setting of sentence-level translation (the attribute +text_en for English translation) from a separate text file (one sentence per +line). For example, one can export the original sentences using write.SentencesHtml, +then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain +text editor, save them as translations.txt and import them using this block. + +Usage: +udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class SetTranslation(Block): + """ + Set text_en to the next available translation. + """ + + def __init__(self, file, overwrite=False, **kwargs): + """ + Create the SetTranslation block. + + Parameters: + file: the name of the text file with the translations (one sentence per line) + overwrite=1: set the translation even if the sentence already has one + (default: do not overwrite existing translations) + """ + super().__init__(**kwargs) + self.file = file + fh = open(self.file, 'r', encoding='utf-8') + self.trlines = fh.readlines() + self.nlines = len(self.trlines) + self.iline = 0 + self.overwrite = overwrite + + def process_tree(self, tree): + if self.iline < self.nlines: + translation = self.trlines[self.iline] + self.iline += 1 + comments = [] + if tree.comment: + comments = tree.comment.split('\n') + i_tr = -1 + for i in range(len(comments)): + # The initial '#' character has been stripped. 
+ if re.match(r'\s*text_en\s*=', comments[i]): + i_tr = i + break + if i_tr >= 0: + if self.overwrite: + comments[i_tr] = ' text_en = ' + translation + else: + comments.append(' text_en = ' + translation) + tree.comment = '\n'.join(comments) + elif self.iline == self.nlines: + logging.warning('There are only %d translation lines but there are more input sentences.' % self.nlines) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py new file mode 100644 index 00000000..7208b6ef --- /dev/null +++ b/udapi/block/ud/sk/fixedeprels.py @@ -0,0 +1,137 @@ +"""Block to fix case-enhanced dependency relations in Slovak.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'a_hoci': 'hoci', + 'ako': 'ako', # remove morphological case + 'ako_na': 'ako', + 'akoby_z': 'z:gen', + 'akže': 'ak', + 'ani_keby': 'keby', + 'až_keď': 'keď', + 'do': 'do:gen', + 'k': 'k:dat', + 'kto': 'kým', ###!!! The lemma should be fixed! The pronoun has grammaticalized as a subordinator. + 'mimo': 'mimo:gen', + 'na_rozdiel_od': 'na_rozdiel_od:gen', + 'na_základ': 'na_základe:gen', + 'od': 'od:gen', + 'pod_vplyv': 'pod_vplyvom:gen', + 'pomoc': 'pomocou:gen', + 'pre': 'pre:acc', + 'prostredníctvom': 'prostredníctvom:gen', + 'prv_ako': 'ako', + 's': 's:ins', + 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl) + 's_dôraz_na': 's_dôrazom_na:acc', + 's_ohľad_na': 's_ohľadom_na:acc', + 's_pomoc': 's_pomocou:gen', + 'smer_k': 'smerom_k:dat', + 'spoločne_s': 'spoločne_s:ins', + 'spolu_s': 'spolu_s:ins', + 'v_dôsledok': 'v_dôsledku:gen', + 'v_meno': 'v_mene:gen', + 'v_oblasť': 'v_oblasti:gen', + 'v_porovnanie_s': 'v_porovnaní_s:ins', + 'v_porovnaniu_s': 'v_porovnaní_s:ins', + 'v_priebeh': 'v_priebehu:gen', + 'v_prípad': 'v_prípade:gen', + 'v_prospech': 'v_prospech:gen', + 'v_rámec': 'v_rámci:gen', + 'v_spolupráca_s': 'v_spolupráci_s:ins', + 'v_súlad_s': 'v_súlade_s:ins', + 'v_súvislosť_s': 'v_súvislosti_s:ins', + 'v_ústrety': 'v_ústrety:dat', + 'v_vzťah_k': 'vo_vzťahu_k:dat', + 'v_závislosť_na': 'v_závislosti_na:loc', + 'vzhľad_na': 'vzhľadom_na:acc', + 'z': 'z:gen', + 'z_hľadisko': 'z_hľadiska:gen', + 'začiatkom': 'začiatkom:gen' + } + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Slovak basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. 
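+ # An illustrative example: for an edeprel 'nmod:na' whose node has a child
+ # preposition 'na' carrying Case=Loc, the branch below rewrites the
+ # relation to 'nmod:na:loc'.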
+ if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + # If we failed to identify the case of the preposition in the + # preceding steps, pick a default. It applies mostly to 'o' + # with wrongly split time values. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':o:acc' + solved = True + m = re.match(r'^(obl(?::arg)?|nmod):(po|v)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + solved = True + # Some cases do not occur with nominal modifiers without preposition. + # If we see them, chances are that it is the same-case modifier, + # and the same case just happens to be the one we see. For vocatives, + # it is also possible that they have been confused with nominatives. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1) + solved = True + # Annotation and conversion errors. + if not solved: + # Povedal som jej „na zdorovie“. + if edep['deprel'] == 'obl:arg:na' and node.form == 'zdorovie': + self.set_basic_and_enhanced(node, edep['parent'], 'ccomp', 'ccomp') + solved = True + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/splittoken.py b/udapi/block/ud/splittoken.py new file mode 100644 index 00000000..16c60a38 --- /dev/null +++ b/udapi/block/ud/splittoken.py @@ -0,0 +1,107 @@ +""" +Block ud.SplitToken will split a given token into multiple tokens. +""" +from udapi.core.block import Block +import re +import logging + + +class SplitToken(Block): + """ + Split a token into two or more. A MISC attribute is used to mark the tokens + that should be split. (The attribute may have been set by an annotator or + by a previous block that tests the specific conditions under which splitting + is desired.) Multiword tokens are currently not supported: The node to be + split cannot belong to a MWT. Note that the result will not be a MWT either + (use the block ud.AddMwt if that is desired). There will be simply a new + attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes + (indicating that this was an error in the source text). + """ + + def __init__(self, misc_name='SplitToken', **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the splitting + default: SplitToken + The value of the attribute should indicate where to split the token. 
+ It should be a string that is identical to node.form except that + there is one or more spaces where the token should be split. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + + def process_node(self, node): + """ + The SplitToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be split to multiple nodes and the + attribute will be removed from MISC, or a warning will be issued that + the splitting cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + value = node.misc[self.misc_name] + if value == '': + return + if node.multiword_token: + logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # Verify that the value of the MISC attribute can be used as specification + # of the split. + if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value): + logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + if re.search(r'\s', node.form): + logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + if re.sub(r' ', '', value) != node.form: + logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + # Do the split. + space_after = node.misc['SpaceAfter'] + forms = value.split(' ') + # Optionally, SplitTokenMorpho in MISC can have the morphological annotation + # of the new tokens. 
For example: + # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act + if node.misc['SplitTokenMorpho'] != '': + morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ') + del node.misc['SplitTokenMorpho'] + else: + morphoblocks = ['' for x in forms] + node.form = forms[0] + last_node = node + for form, morpho in zip(forms[1:], morphoblocks[1:]): + last_node.misc['SpaceAfter'] = 'No' + last_node.misc['CorrectSpaceAfter'] = 'Yes' + lemma = form + upos = node.upos + feats = str(node.feats) + xpos = node.xpos + if morpho != '': + cols = morpho.split('\\t') + for c in cols: + colname, value = c.split('=', 1) + if colname == 'LEMMA': + lemma = value + elif colname == 'UPOS': + upos = value + elif colname == 'FEATS': + feats = re.sub(r'\\p', '|', value) + elif colname == 'XPOS': + xpos = value + else: + logging.fatal(f"c = {c}") + new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep') + new_node.shift_after_node(last_node) + last_node = new_node + last_node.misc['SpaceAfter'] = space_after + del node.misc[self.misc_name] diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py index 094f181a..44575e0c 100644 --- a/udapi/block/ud/splitunderscoretokens.py +++ b/udapi/block/ud/splitunderscoretokens.py @@ -23,7 +23,7 @@ class SplitUnderscoreTokens(Block): Real-world use cases: UD_Irish (`default_deprel=fixed`) and UD_Czech-CLTT v1.4. """ - def __init__(self, deprel=None, default_deprel='flat', **kwargs): + def __init__(self, deprel=None, default_deprel='flat', lemma='split', **kwargs): """Create the SplitUnderscoreTokens block instance. Args: @@ -31,14 +31,21 @@ def __init__(self, deprel=None, default_deprel='flat', **kwargs): Most common values are: flat, fixed, compound. Default=None. default_deprel: Which deprel to use for the newly created nodes if the heuristics in `deprel_for()` method fail. Default=flat. + lemma: What to do with the lemmas? + - 'split' (the default) means to split them on underscores as well + (and warn in case of a different number of underscores than in the form). + - 'form' means to copy the forms to the lemmas """ super().__init__(**kwargs) self.deprel = deprel self.default_deprel = default_deprel + self.lemma = lemma def process_node(self, node): if node.form != '_' and '_' in node.form: forms = node.form.split('_') + if self.lemma == 'form': + node.lemma = node.form lemmas = node.lemma.split('_') if len(forms) != len(lemmas): logging.warning("Different number of underscores in %s and %s, skipping.", diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py new file mode 100644 index 00000000..952644f8 --- /dev/null +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -0,0 +1,46 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Uyghur.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. + """ + # Sometimes there is a double error: it should not be auxiliary, it is + # attached as aux but it is not tagged AUX. So we only look at the deprel. 
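+ # A schematic sketch of the rewrite done below (hypothetical configuration,
+ # not quoted from the corpus): the putative auxiliary is retagged VERB,
+ # takes over the parent and deprel of the lexical verb, inherits its
+ # subjects, obliques and similar dependents, and the lexical verb is then
+ # re-attached to it as xcomp.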
+ if node.udeprel == 'aux': + # بەر/بار = give (used with actions done for the benefit of somebody) + # چىق = go out + # چىقىش = come out + # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + # باق = do ever? + # ئۆت = pass + # كۆرۈش = see + # باشلى = start + # يەت = be enough + # قايت = return + # چۈش = fall down + # قىل = do + # چاپ = jump + # قورق = fear + # كەلتۈر = cause + # كىر = enter + # _ ... some putative auxiliaries do not even have a lemma + if re.match(r'^(بەر|بار|چىق|چىقىش|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل|چاپ|قورق|كەلتۈر|كىر)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' diff --git a/udapi/block/ud/yue/lemmatize.py b/udapi/block/ud/yue/lemmatize.py new file mode 100644 index 00000000..87279dc1 --- /dev/null +++ b/udapi/block/ud/yue/lemmatize.py @@ -0,0 +1,43 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + '𡃁仔': '笭仔', + '仲': '重', + '企': '徛', + '係咪': '係', + '出嚟': '出唻', + '可': '可以', + '啦': '喇', + '㗎喇': '㗎嘑', + '喇': '嘑', + '嚟': '唻', + '就嚟': '就唻', + '死𡃁妹': '死笭妹', + '老豆': '老頭', + '蚊': '緡', + '蛋撻': '蛋澾', + '返嚟': '返唻', + '過嚟人': '過唻人', + '過嚟': '過唻' + } + + def process_node(self, node): + """ + Parts of the Cantonese treebank lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + + For Cantonese, lemmatization includes normalization of some characters. + These are the few cases where lemma differs from the surface form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py new file mode 100644 index 00000000..abacf29f --- /dev/null +++ b/udapi/block/ud/zh/lemmatize.py @@ -0,0 +1,81 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def __init__(self, rewrite='empty', **kwargs): + """ + Create the ud.zh.Lemmatize block instance. + + Args: + rewrite=empty: set the lemma if it was empty so far; do not touch the rest + rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest + rewrite=all: set the lemma regardless of what it was previously + """ + super().__init__(**kwargs) + if not re.match(r'^(empty|form|all)$', rewrite): + raise ValueError("Unexpected value of parameter 'rewrite'") + self.rewrite = rewrite + + # dictionary: form --> lemma + lemma = { + # The plural suffix -men. 
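+ # Each entry maps a form with the plural suffix 們/们 to the bare base word,
+ # e.g. 我們/我们 "we" is lemmatized to 我 "I"; both traditional and
+ # simplified spellings are listed.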
+ '我們': '我', # trad + '我们': '我', # simp + '他們': '他', # trad + '他们': '他', # simp + '它們': '它', # trad + '它们': '它', # simp + '牠們': '牠', # trad + '她們': '她', # trad + '她们': '她', # simp + '人們': '人', # trad + '人们': '人' # simp + } + + def process_node(self, node): + """ + Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + """ + if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + # Lemmatize negated verbs to their affirmative forms. + # 不是 bùshì = not be + # 沒有 没有 méiyǒu = not exist + # 沒能 没能 méinéng = cannot + # 未能 wèinéng = cannot + # Lemmatize question verbs to their base forms. + # 要不要 yàobùyào = do (you) want? + # 有没有 yǒuméiyǒu = do (you) have? + # Verbs that are derived from the copula and tagged as the copula need + # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). + # 亦為 亦为 yìwèi = také + # 則為 则为 zéwèi = potom + # 更為 更为 gèngwèi = více + # 認為 认为 rènwéi = myslet, věřit + # 以為 以为 yǐwéi = myslet, věřit + # 以爲 以为 yǐwéi = myslet, věřit + if re.match(r'^(AUX|VERB)$', node.upos): + m1 = re.match(r'^([不没沒未])(.+)$', node.form) + m2 = re.match(r'^(.+)([不没沒未])\1$', node.form) + m3 = re.search(r'([是爲為为])', node.form) + if m1: + node.lemma = m1.group(2) + node.feats['Polarity'] = 'Neg' + elif m2: + node.lemma = m2.group(1) + node.feats['Mood'] = 'Int' + elif m3: + node.lemma = m3.group(1) + if node.lemma == '爲': + node.lemma = '為' + elif node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 3ec4a131..069fc9fb 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -1,6 +1,7 @@ """Block udpipe.Base for tagging and parsing using UDPipe.""" from udapi.core.block import Block from udapi.tool.udpipe import UDPipe +from udapi.tool.udpipeonline import UDPipeOnline from udapi.core.bundle import Bundle KNOWN_MODELS = { @@ -118,13 +119,14 @@ class Base(Block): """Base class for all UDPipe blocks.""" # pylint: disable=too-many-arguments - def __init__(self, model=None, model_alias=None, - tokenize=True, tag=True, parse=True, resegment=False, **kwargs): - """Create the udpipe.En block object.""" + def __init__(self, model=None, model_alias=None, online=False, + tokenize=True, tag=True, parse=True, resegment=False, + ranges=False, delete_nodes=False, **kwargs): super().__init__(**kwargs) - self.model, self.model_alias = model, model_alias + self.model, self.model_alias, self.online = model, model_alias, online self._tool = None self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment + self.ranges, self.delete_nodes = ranges, delete_nodes @property def tool(self): @@ -134,44 +136,56 @@ def tool(self): if not self.model: if not self.model_alias: raise ValueError('model (path/to/model) or model_alias (e.g. 
en) must be set!') - self.model = KNOWN_MODELS[self.model_alias] - self._tool = UDPipe(model=self.model) + if self.online: + self.model = self.model_alias + else: + self.model = KNOWN_MODELS[self.model_alias] + if self.online: + self._tool = UDPipeOnline(model=self.model) + else: + self._tool = UDPipe(model=self.model) return self._tool def process_document(self, doc): - tok, tag, par = self.tokenize, self.tag, self.parse + tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges + if self.zones == "all" and self.online: + self.tool.process_document(doc, tok, tag, par, reseg, ranges) + return old_bundles = doc.bundles new_bundles = [] for bundle in old_bundles: for tree in bundle: new_bundles.append(bundle) if self._should_process_tree(tree): + if self.delete_nodes: + for subroot in tree.children: + subroot.remove() if tok: - new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment, - tag=self.tag, parse=self.parse) + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, + tag=tag, parse=par, ranges=ranges) if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' for i, new_tree in enumerate(new_trees[1:], 2): - new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") new_tree.zone = tree.zone new_bundle.add_tree(new_tree) new_bundles.append(new_bundle) - elif not tok and tag and par: - self.tool.tag_parse_tree(tree) - elif not tok and not tag and not par and self.resegment: + elif not tok and not reseg and (tag or par): + self.tool.tag_parse_tree(tree, tag=tag, parse=par) + elif not tok and reseg and not tag and not par: sentences = self.tool.segment_text(tree.text) if len(sentences) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' tree.text = sentences[0] for i, sentence in enumerate(sentences[1:], 2): - new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") new_tree = new_bundle.create_tree(zone=tree.zone) new_tree.text = sentence new_bundles.append(new_bundle) else: - raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}") doc.bundles = new_bundles ''' diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index b814b80d..6e4f2ac9 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,7 +29,8 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - expand_code=True, **kwargs): + coref_mention=None, coref_entity=None, empty_nodes=False, + expand_code=True, mwt=None, **kwargs): super().__init__(**kwargs) self.doc = doc self.bundle = bundle @@ -37,10 +38,14 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.node = node self.start = start self.end = end + self.mwt = mwt self.before_doc = before_doc self.after_doc = after_doc self.before_bundle = before_bundle self.after_bundle = after_bundle + self.coref_mention = coref_mention + self.coref_entity = coref_entity + self.empty_nodes = empty_nodes self.expand_code = expand_code self.count = collections.Counter() @@ -66,11 +71,21 @@ def 
process_document(self, document): if self.doc: exec(self.expand_eval_code(self.doc)) - if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: + if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node or self.mwt: for bundle in doc.bundles: # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) + if self.coref_entity or self.coref_mention: + for entity in doc.coref_entities: + if self.coref_entity: + this = entity + exec(self.expand_eval_code(self.coref_entity)) + if self.coref_mention: + for mention in entity.mentions: + this = mention + exec(self.expand_eval_code(self.coref_mention)) + def process_bundle(self, bundle): # Extract variables, so they can be used in eval code document = doc = bundle.document @@ -82,7 +97,7 @@ def process_bundle(self, bundle): if self.bundle: exec(self.expand_eval_code(self.bundle)) - if self.tree or self.node: + if self.tree or self.node or self.mwt: trees = bundle.trees for tree in trees: if self._should_process_tree(tree): @@ -102,10 +117,16 @@ def process_tree(self, tree): exec(self.expand_eval_code(self.tree)) if self.node: - for node in tree.descendants(): + nodes = tree.descendants_and_empty if self.empty_nodes else tree.descendants + for node in nodes: this = node exec(self.expand_eval_code(self.node)) + if self.mwt: + for mwt in tree.multiword_tokens: + this = mwt + exec(self.expand_eval_code(self.mwt)) + def process_start(self): if self.start: exec(self.expand_eval_code(self.start)) diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py index e05afe76..e1ea838c 100644 --- a/udapi/block/util/findbug.py +++ b/udapi/block/util/findbug.py @@ -5,9 +5,12 @@ insert "util.FindBug block=" into the scenario, e.g. to debug ``second.Block``, use -udapy first.Block util.FindBug block=second.Block > bug.conllu + udapy first.Block util.FindBug block=second.Block > bug.conllu This will create the file bug.conllu with the bundle, which caused the bug. + +The second.Block can have any parameters, e.g. + udapy first.Block util.FindBug block=second.Block param1=value1 param2=value2 > bug.conllu """ import copy import logging @@ -20,24 +23,31 @@ class FindBug(BaseWriter): """Debug another block by finding a minimal testcase conllu file.""" - def __init__(self, block, first_error_only=True, **kwargs): - """Args: block, first_error_only""" - super().__init__(**kwargs) + def __init__(self, block, first_error_only=True, + files='-', filehandle=None, docname_as_file=False, encoding='utf-8', + newline='\n', overwrite=False, + **kwargs): + """Args: block, first_error_only. + All other parameters (which are not parameters of BaseWriter) + will be passed to the block being inspected. + """ + super().__init__(files, filehandle, docname_as_file, encoding, newline, overwrite) self.block = block self.first_error_only = first_error_only + self._kwargs = kwargs def process_document(self, document): sub_path, class_name = _parse_block_name(self.block) module = "udapi.block." + sub_path + "." 
+ class_name.lower() try: - command = "from " + module + " import " + class_name + " as b" + command = "from " + module + " import " + class_name + " as B" logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used except Exception: logging.warning("Error when trying import the block %s", self.block) raise - command = "b()" # TODO params as kwargs + command = "B(**self._kwargs)" logging.debug("Trying to evaluate this: %s", command) new_block = eval(command) # pylint: disable=eval-used diff --git a/udapi/block/util/joinsentence.py b/udapi/block/util/joinsentence.py new file mode 100644 index 00000000..578f3865 --- /dev/null +++ b/udapi/block/util/joinsentence.py @@ -0,0 +1,77 @@ +""" +Block util.JoinSentence will join a given sentence with the preceding one. +""" +import logging +from udapi.core.block import Block + +class JoinSentence(Block): + """ + Joins a sentence with the preceding one. There are two ways how to indicate + the sentences that this block should process. + + Method 1: Parameter sent_id provides the id of the sentence that should be + merged with the preceding one. At most one sentence pair from the input will + be merged, even if there are multiple sentences with the given id. + + Method 2: A MISC attribute can be specified that, if found, will trigger + joining of the current sentence to the previous one. With this approach, + multiple sentence pairs can be merged during one run. + """ + + def __init__(self, sent_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be appended to the previous one + misc_name: name of the MISC attribute that can trigger the joining (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the joining; if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + if misc_name: + if sent_id: + logging.fatal('Cannot combine misc_value with sent_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + self.sent_id = sent_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + previous_tree = None + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to join all zones but we do not try to do it at present. + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + # The MISC attribute we are looking for should logically occur + # on the first node of the sentence but we can take it from any node. + join_commands = [n for n in root.descendants if n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if join_commands: + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove from the node the MISC attribute that triggered the sentence split. 
+ for n in join_commands: + n.misc[self.misc_name] = '' + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + else: + previous_tree = root + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + root = bundle.get_tree() + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + # We have found our sentence. No need to process the rest of the document. + break diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py index c57f7443..bcb4f894 100644 --- a/udapi/block/util/mark.py +++ b/udapi/block/util/mark.py @@ -15,7 +15,7 @@ class Mark(Block): udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R """ - def __init__(self, node, mark=1, add=True, **kwargs): + def __init__(self, node, mark=1, mark_attr="Mark", add=True, print_stats=False, empty=False, **kwargs): """Create the Mark block object. Args: @@ -24,17 +24,36 @@ def __init__(self, node, mark=1, add=True, **kwargs): `mark`: the node will be marked with `Mark=` in `node.misc`. Default=1. + `mark_attr`: use this MISC attribute name instead of "Mark". + `add`: should we keep existing Mark|ToDo|Bug? Default=True. + + `print_stats`: print the total number of marked nodes to stdout at process_end + + `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.mark = mark + self.mark_attr = mark_attr self.node = node self.add = add + self.print_stats = print_stats + self._marked = 0 + self.empty = empty def process_node(self, node): if eval(self.node): - node.misc['Mark'] = self.mark + node.misc[self.mark_attr] = self.mark + self._marked += 1 elif not self.add: - del node.misc['Mark'] + del node.misc[self.mark_attr] del node.misc['ToDo'] del node.misc['Bug'] + + def process_empty_node(self, empty_node): + if self.empty: + self.process_node(empty_node) + + def process_end(self): + if self.print_stats: + print(f'util.Mark marked {self._marked} nodes') diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 22a7a03e..e102ca9c 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -1,5 +1,7 @@ """util.MarkDiff is a special block for marking differences between parallel trees.""" +import collections import difflib +import pprint from udapi.core.block import Block @@ -7,13 +9,43 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, **kwargs): - """Create the Mark block object.""" + mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False, + align=False, align_attr='Align', **kwargs): + """Create the Mark block object. + Params: + gold_zone: Which of the zones should be treated as gold? + (The changes are interpreted as from a "pred"=predicted zone into the gold zone.) + attributes: Which node attributes should be considered when searching for diffs? + The tree topology, i.e. node parent is always considered. + mark: What value should be used in `node.misc['Mark']` of the differing nodes? + mark_attr: use this MISC attribute name instead of "Mark". + Use mark_attr=0 to prevent marking diffs in MISC. 
+ add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, + so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. + print_stats: How many lines of statistics should be printed? -1 means all. + ignore_parent: ignore differences in dependency parents + align: store word alignment, possible values are False (no alignment stored, the default) + "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord, + "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and + "both", i.e. both from-pred and from-gold. + If only forms should be considered for inducing the word alignment, + you should use "util.MarkDiff attributes='form' ignore_parent=1 align=1". + Only one-to-one alignment is supported. + align_attr: use this MISC attribute name instead of "Align". + """ super().__init__(**kwargs) self.gold_zone = gold_zone self.attrs = attributes.split(',') self.mark = mark + self.mark_attr = mark_attr self.add = add + self.print_stats = print_stats + self.ignore_parent = ignore_parent + self.align = align + self.align_attr = align_attr + self.stats = collections.Counter() + if not mark_attr and not align and not print_stats: + raise ValueError('mark_attr=0 does not make sense without align or print_stats') def process_tree(self, tree): gold_tree = tree.bundle.get_tree(self.gold_zone) @@ -21,17 +53,17 @@ def process_tree(self, tree): return if not self.add: for node in tree.descendants + gold_tree.descendants: - del node.misc['Mark'] + del node.misc[self.mark_attr] del node.misc['ToDo'] del node.misc['Bug'] pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants # Make sure both pred and gold trees are marked, even if one has just deleted nodes. - if len(pred_nodes) != len(gold_nodes): - tree.add_comment('Mark = %s' % self.mark) - gold_tree.add_comment('Mark = %s' % self.mark) - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in pred_nodes] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_nodes] + if len(pred_nodes) != len(gold_nodes) and self.mark_attr: + tree.add_comment(f'{self.mark_attr} = {self.mark}') + gold_tree.add_comment(f'{self.mark_attr} = {self.mark}') + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes] matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False) diffs = list(matcher.get_opcodes()) @@ -41,14 +73,45 @@ def process_tree(self, tree): if edit in {'equal', 'replace'}: for i in range(pred_lo, pred_hi): alignment[i] = i - pred_lo + gold_lo + if self.align in ("both", "from-pred"): + pred_nodes[i].misc[self.align_attr] = i - pred_lo + gold_lo + 1 + if self.align in ("both", "from-gold"): + gold_nodes[i - pred_lo + gold_lo].misc[self.align_attr] = i + 1 for diff in diffs: edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: - p_node.misc['Mark'] = self.mark - g_node.misc['Mark'] = self.mark + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['ONLY-PARENT-CHANGED'] += 1 + if self.mark_attr: + p_node.misc[self.mark_attr] = self.mark + g_node.misc[self.mark_attr] = self.mark else: - for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: - node.misc['Mark'] = self.mark + if 
self.mark_attr: + for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: + node.misc[self.mark_attr] = self.mark + if self.print_stats: + if edit == 'replace': + # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED + n = min(pred_hi - pred_lo, gold_hi - gold_lo) + for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n], gold_nodes[gold_lo:gold_lo + n]): + for attr in self.attrs: + p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) + if p_value != g_value: + self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['PARENT-CHANGED'] += 1 + pred_lo, gold_lo = pred_lo + n, gold_lo + n + for node in gold_nodes[gold_lo:gold_hi]: + self.stats['ADD-WORD'] += 1 + self.stats['ADD-LEMMA: ' + node.lemma] += 1 + for node in pred_nodes[pred_lo:pred_hi]: + self.stats['DELETE-WORD'] += 1 + self.stats['DELETE-LEMMA: ' + node.lemma] += 1 + + def process_end(self): + if self.print_stats: + how_many = None if self.print_stats in (-1, '-1') else self.print_stats + for edit, count in self.stats.most_common(how_many): + print(f'{count:4} {edit}') diff --git a/udapi/block/util/markmwtbugsatnodes.py b/udapi/block/util/markmwtbugsatnodes.py new file mode 100644 index 00000000..ebc2ef4e --- /dev/null +++ b/udapi/block/util/markmwtbugsatnodes.py @@ -0,0 +1,25 @@ +"""util.MarkMwtBugsAtNodes copies Bug attributes from MISC of multiword tokens to MISC of member nodes. + Otherwise they will be ignored when write.TextModeTrees marked_only=1 is called.""" + +from udapi.core.block import Block + +class MarkMwtBugsAtNodes(Block): + """ + If a node belongs to a multiword token and the MWT has Bug in MISC, copy + the Bug to the node so that filtering trees with bugs works. + The same bug note will be copied to all nodes in the MWT. + """ + + ###!!! Do we want to do the same thing also with ToDo attributes? + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def process_node(self, node): + if node.multiword_token: + if node.multiword_token.misc['Bug']: + self.bug(node, node.multiword_token.misc['Bug']) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..4cce4ab8 --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,97 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block +from pathlib import Path + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. 
+ It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", + sent_id_from_filename=False, sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. + `empty_node_ord`: normalize ord attributes of empty nodes. Default=False. + `start_sent_id`: the first sent_id number + `sent_id_prefix`: a string to be prepended before the integer sent_id. Default=empty string. + `sent_id_from_filename`: add Path(doc.meta["loaded_from"]).stem before the `sent_id_prefix`. Default=False. + `sent_id_reset_at_newdoc`: reset the sent_id counter to 1 for each new document. Default=False. + `newdoc_from_filename`: set newdoc to Path(doc.meta["loaded_from"]).stem. Default=False. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + self.sent_id = sent_id + self.empty_node_ord = empty_node_ord + self.next_sent_id = start_sent_id + self.sent_id_prefix = sent_id_prefix + self.sent_id_from_filename = sent_id_from_filename + self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc + self.newdoc_from_filename = newdoc_from_filename + if sent_id_reset_at_newdoc and not sent_id_from_filename: + raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename") + if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename: + self.sent_id = True + + # TODO: normalize also the order of standardized comments like text, sent_id,... + + def process_bundle(self, bundle): + is_newdoc = any(tree.newdoc for tree in bundle.trees) + if self.newdoc_from_filename and is_newdoc: + tree = next(tree for tree in bundle.trees if tree.newdoc) + tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem + if self.sent_id: + if self.sent_id_reset_at_newdoc and is_newdoc: + self.next_sent_id = 1 + prefix = self.sent_id_prefix + if self.sent_id_from_filename: + prefix = Path(bundle.document.meta["loaded_from"]).stem + prefix + bundle.bundle_id = prefix + str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + if self.empty_node_ord: + node_ord, empty_ord = 0, 0 + for node in tree.descendants_and_empty: + if node.is_empty(): + empty_ord += 1 + old_empty_ord, new_empty_ord = str(node.ord), f"{node_ord}.{empty_ord}" + if old_empty_ord != new_empty_ord: + # Make sure all nodes in this sentence have deserialized enhanced deps. 
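For illustration, the sent_id renumbering options added to util.Normalize might be driven like this. A hedged sketch, not part of the patch; train.conllu is a placeholder and the example assumes the document was loaded from a file, so that doc.meta['loaded_from'] is set by the reader.

    import udapi
    from udapi.block.util.normalize import Normalize
    from udapi.block.write.conllu import Conllu

    doc = udapi.Document("train.conllu")
    # Re-sort FEATS/MISC and renumber sentence ids as train-1, train-2, ...
    Normalize(sent_id_from_filename=True, sent_id_prefix="-").apply_on_document(doc)
    Conllu(files="train-normalized.conllu").apply_on_document(doc)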
+ for n in tree.descendants_and_empty: + n.deps + node.ord = new_empty_ord + else: + empty_ord = 0 + node_ord = node.ord + for node in tree.descendants: + self.process_node(node) + + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py index aa7131b7..9a895b88 100644 --- a/udapi/block/util/see.py +++ b/udapi/block/util/see.py @@ -51,7 +51,7 @@ class See(Block): """Print statistics about the nodes specified by the parameter `node`.""" - def __init__(self, node, n=5, stats=STATS, **kwargs): + def __init__(self, node, n=5, stats=STATS, empty=False, **kwargs): """Args: `node`: Python expression to be evaluated for each node and if True, the node will be considered "matching". @@ -62,6 +62,7 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): `children` = number of children nodes, `p_lemma` = lemma of a parent node, etc). See `udapi.core.Node.get_attrs` for a full list of statistics. + `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.node = node @@ -73,11 +74,13 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): self.match[stat] = Counter() self.every[stat] = Counter() self.overall = Counter() + self.empty = empty def process_tree(self, root): self.overall['trees'] += 1 tree_match = False - for node in root.descendants: + nodes = root.descendants_and_empty if self.empty else root.descendants + for node in nodes: matching = self.process_node(node) self.overall['nodes'] += 1 if matching: diff --git a/udapi/block/util/splitsentence.py b/udapi/block/util/splitsentence.py new file mode 100644 index 00000000..b6ca57d8 --- /dev/null +++ b/udapi/block/util/splitsentence.py @@ -0,0 +1,134 @@ +""" +Block util.SplitSentence will split a given sentence at a given token. +""" +import logging +from udapi.core.block import Block +from udapi.core.root import Root + +class SplitSentence(Block): + """ + If the sent_id of the current sentence matches the parameter, splits the + sentence into two. The first token of the second sentence is also given as + a parameter. + + Alternatively, a MISC attribute can be specified that triggers sentence + splitting at the given token. With this approach, multiple sentence splits + can be performed during one run. + """ + + def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be split (new ids will have A and B appended) + word_id: which word should be the first word of the second sentence (tokens and words will be renumbered) + misc_name: name of the MISC attribute that can trigger the split (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the split; if not specified, then simple occurrence of the attribute with any value will cause the split + MISC attributes that have triggered sentence split will be removed from their node. 
+ """ + super().__init__(**kwargs) + if misc_name: + if sent_id or word_id: + logging.fatal('Cannot combine misc_value with sent_id or word_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + if not word_id: + logging.fatal('Missing parameter word_id') + self.sent_id = sent_id + self.word_id = word_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to split all zones but we do not try to do it at present. + # (The zones may be translations to other languages and it is not likely that we would + # want to split each translation at the same position.) + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + split_points = [n for n in root.descendants if n.ord > 1 and n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if split_points: + # Create as many new bundles as there are split points. + n_new = len(split_points) + current_bid = bundle.bundle_id + idletter = 'B' # a letter will be added to bundle ids to distinguish them + for i in range(n_new): + new_bundle = document.create_bundle() + new_bundle.bundle_id = current_bid + idletter + new_root = Root(zone='') + new_bundle.add_tree(new_root) + # Identify nodes to move to the new bundle. + first_node_id = split_points[i].ord + if i < n_new - 1: + next_first_node_id = split_points[i+1].ord + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id and n.ord < next_first_node_id] + else: + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id] + new_root.steal_nodes(nodes_to_move) + self.make_zeros_roots(new_root) + new_root.text = new_root.compute_text() + # The new bundle was created at the end of the document. + # Move it to the position right after the current bundle. + document.bundles.pop() + document.bundles.insert(bundle_no + i + 1, new_bundle) + idletter = chr(ord(idletter) + 1) + # Remove from the node the MISC attribute that triggered the sentence split. + split_points[i].misc[self.misc_name] = '' + # Update the id of the current bundle, fix its zero-dependents and recompute sentence text. + bundle.bundle_id += 'A' + self.make_zeros_roots(root) + root.text = root.compute_text() + # Update the bundle numbers of the new bundles and all bundles after them. + updated_no = bundle_no + 1 + for b in document.bundles[(bundle_no+1):]: + b.number = updated_no + updated_no += 1 + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + root = bundle.get_tree() + nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id] + if len(nodes_to_move) == 0: + logging.fatal('No nodes to move to the new sentence; word_id may be out of range') + # Create a new bundle at the end of the current document. + new_bundle = document.create_bundle() + # Move the new bundle to the position right after the current bundle. 
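A possible way to drive the two splitting modes of util.SplitSentence from Python (a hedged sketch, not part of the patch; the sentence id, the word index and the SentenceBreak attribute are placeholders):

    import udapi
    from udapi.block.util.splitsentence import SplitSentence
    from udapi.block.write.conllu import Conllu

    doc = udapi.Document("in.conllu")
    # Split sentence s12 so that its 7th word starts a new sentence (ids become s12A and s12B).
    SplitSentence(sent_id="s12", word_id=7).apply_on_document(doc)
    # Or split so that every node carrying SentenceBreak=Yes in MISC starts a new sentence
    # (multiple splits per run):
    # SplitSentence(misc_name="SentenceBreak", misc_value="Yes").apply_on_document(doc)
    Conllu(files="out.conllu").apply_on_document(doc)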
+ new_bundle_no = bundle_no + 1 + document.bundles.pop() + document.bundles.insert(new_bundle_no, new_bundle) + updated_no = new_bundle_no + for b in document.bundles[new_bundle_no:]: + b.number = updated_no + updated_no += 1 + new_bundle.bundle_id = bundle.bundle_id + 'B' + bundle.bundle_id += 'A' + new_root = Root(zone='') + new_bundle.add_tree(new_root) + new_root.steal_nodes(nodes_to_move) + # The steal_nodes() method does not make sure that all nodes newly attached + # to the artificial root have the 'root' relation. Fix it. + self.make_zeros_roots(root) + self.make_zeros_roots(new_root) + # Update the sentence text attributes of the new sentences. + root.text = root.compute_text() + new_root.text = new_root.compute_text() + # We have found our sentence. No need to process the rest of the document. + break + + def make_zeros_roots(self, root): + """ + The steal_nodes() method does not make sure that all nodes newly attached + to the artificial root have the 'root' relation. Fix it. + """ + n_root = 0 + for n in root.descendants: + if n.parent.is_root(): + n.deprel = 'root' + n_root += 1 + if n_root > 1: + logging.warning('More than one 0:root relation in newly segmented sentence %s.' % root.bundle.bundle_id) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 137c95e9..9920d0b6 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -13,6 +13,7 @@ def __init__(self, tsv=False, **kwargs): """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 self.tsv = tsv def process_tree(self, tree): @@ -22,13 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc or tree == tree.document[0].trees[0]: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): if self.tsv: - print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty)))) + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) else: print('%8d trees\n%8d words' % (self.trees, self.words)) if self.mwts: print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) if self.empty: print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 66ae320b..2573b5ae 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -26,7 +26,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual # value of the attribute and make note on which line (i_*) they were present. 
comment_lines = tree.comment.splitlines() - i_newdoc, i_newpar, i_sent_id, i_text = -1, -1, -1, -1 + i_newdoc, i_newpar, i_sent_id, i_text, i_global_entity = -1, -1, -1, -1, -1 for i, c_line in enumerate(comment_lines): if c_line == '$SENT_ID': i_sent_id = i @@ -50,6 +50,13 @@ def process_tree(self, tree): # pylint: disable=too-many-branches comment_lines[i] = ' newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '') else: comment_lines[i] = None + elif c_line == '$GLOBAL.ENTITY': + i_global_entity = i + ge = tree.document.meta.get('global.Entity') + if ge: + comment_lines[i] = ' global.Entity = ' + ge + else: + comment_lines[i] = None # Now print the special comments: global.columns, newdoc, newpar, sent_id and text. # If these comments were already present in tree.comment (as marked with the placeholders), @@ -68,6 +75,15 @@ def process_tree(self, tree): # pylint: disable=too-many-branches printed_i += 1 if comment_lines[printed_i]: print('#' + comment_lines[printed_i]) + ge = tree.document.meta.get('global.Entity') + if ge: + if i_global_entity == -1: + print('# global.Entity = ' + ge) + else: + while printed_i < i_global_entity: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) if tree.newpar: if i_newpar == -1: print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '')) @@ -118,10 +134,10 @@ def process_tree(self, tree): # pylint: disable=too-many-branches '_' if node._feats is None else str(node.feats), head, node.deprel, node.raw_deps, '_' if node._misc is None else str(node.misc)))) - # Empty sentences are not allowed in CoNLL-U, + # Empty sentences (sentences with no non-empty nodes) are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), # we will print an artificial node, so we can print the comments. - if not nodes: + if not tree._descendants: print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes") # Empty line separates trees in CoNLL-U (and is required after the last tree as well) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..1d5d4716 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,478 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference. + +When using lazy loading of documents (infinite scrolling), +modern browsers don't allow JavaScript to load files from a local file system +("Access to XMLHttpRequest at 'file://.../doc2.html' from origin 'null' has been +blocked by CORS policy: Cross origin requests are only supported for protocol schemes: +http, data, chrome, chrome-extension, https.") + +The recommended solution is to start a local web server, e.g. using + python -m http.server +and browse http://0.0.0.0:8000/my.html. 
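To make the recommended workflow concrete, generating and viewing the coreference visualization could look like this. A hedged sketch, not part of the patch; the file name is a placeholder and any writer parameters beyond the inherited files= may differ from what is assumed here.

    import udapi
    from udapi.block.write.corefhtml import CorefHtml

    doc = udapi.Document("corefud-sample.conllu")
    CorefHtml(files="my.html").apply_on_document(doc)
    # When the input contains several documents, the writer also emits gzipped
    # per-document chunks for lazy loading, so serve the output over HTTP
    # instead of opening the file directly:
    #   python -m http.server    # then browse http://0.0.0.0:8000/my.html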
+ +Non-recommended solution is to run + google-chrome --new-window --user-data-dir=/tmp/chrome-proxy --allow-file-access-from-files my.html +""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter +import udapi.block.write.html +import gzip +import sys +import os +import re + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +HTYPES = 'PROPN NOUN PRON VERB DET OTHER'.split() + +HEADER = ''' + +Udapi CorefUD viewer + + +''' + +CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} +.close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} +i.empty {color: gray; border: 3px outset gray; padding: 1px;} +.sentence .singleton {border-style: dotted;} +.crossing:before {content: "!"; display: block; background: #ffd500;} +.active {border: 1px solid red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} +.sent_id {display: none; background: #ddd; border-radius: 3px;} +''' + +SCRIPT_BASE = ''' +function add_mention_listeners(mentions){ + mentions.click(function(e) { + let was_selected = $(this).hasClass("selected"); + $(".m").removeClass("selected"); + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + e.stopPropagation(); + }); + mentions.hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} + ); +} +add_mention_listeners($(".m")); + +window.onhashchange = function() { + $(".m").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + +async function load_doc(doc_num) { + loading_now = true; + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ + if (! 
load_fail_reported) { + load_fail_reported = true; + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." + + " Run a web server 'python -m http.server'\\n" + + "error = " + error); + } + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + $('.eid').toggle($('#show-eid')[0].checked); + $('.etype').toggle($('#show-etype')[0].checked); + $('.sent_id').toggle($('#show-sent_id')[0].checked); + $('.showtree').toggle($('#show-trees')[0].checked); + $('.m').toggleClass('nocolor', ! $('#show-color')[0].checked); + $('.m').toggleClass('nobox', ! $('#show-boxes')[0].checked); + $('.norm').toggle($('#show-norm')[0].checked); + $('.head').toggleClass('nobold', ! $('#show-heads')[0].checked); + $('.empty').toggle($('#show-empty')[0].checked); + $('.sentence').toggleClass('display-inline', ! $('#show-breaks')[0].checked); + $('.par').toggle($('#show-pars')[0].checked); + $('h1').toggle($('#show-docs')[0].checked); + $('.m').toggleClass('htype',$('#htype')[0].checked) + loading_now = false; +} + +var docs_loaded = 1; +var load_fail_reported = false; +var loading_now = false; +add_show_tree_button = function(index, el){ // to be redefined later if show_trees=True + $(el).prepend('🆔' + el.dataset.id + ''); +} +function load_more() { + if (!loading_now && $(window).scrollTop() >= $(document).height() - $(window).height() - 42 && docs_loaded < all_docs) { + docs_loaded += 1; + load_doc(docs_loaded); + } +} +$(window).scroll(load_more); +const resizeObserver = new ResizeObserver(entries =>load_more()); +resizeObserver.observe(document.body); +''' + +SCRIPT_SHOWTREE = ''' +function show_tree_in_tdiv(tdiv, doc_number, index){ + tdiv.treexView([docs_json[doc_number][index]]); + $("\n' + ) + + # The first ud_doc will be printed to the main html file. + self.process_ud_doc(ud_docs[0], 1) + print('') # id=main + + # Other ud_docs will be printed into separate files (so they can be loaded lazily) + orig_stdout = sys.stdout + try: + for i, ud_doc in enumerate(ud_docs[1:], 2): + sys.stdout = gzip.open(f"{self.docs_dir}/doc{i}.html.gz", 'wt') + self.process_ud_doc(ud_doc, i) + sys.stdout.close() + finally: + sys.stdout = orig_stdout + + print(f'') + print('') + + def _start_subspan(self, subspan, crossing=False): + m = subspan.mention + e = m.entity + classes = f'{_dom_esc(e.eid)} {self._mention_ids[m]} {e.etype or "other"} m' + title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' + classes += f" {m.head.upos if m.head.upos in HTYPES else 'OTHER'}" + title += f'\nhead-upos={m.head.upos}' + if self.colors: + classes += f' {self._entity_colors[e]}' + if all(w.is_empty() for w in subspan.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + if crossing: + classes += ' crossing' + title += '\ncrossing' + if m.other: + title += f'\n{m.other}' + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{_dom_esc(e.eid)}" ' + # The title should be always rendered left-to-right (e.g. "head=X", not "X=head"), + # so for RTL languages, we need to use explicit dir="ltr" and insert a nested span with dir="rtl". 
+ if self.rtl: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + else: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + subspans = [] + for mention in mentions: + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) + + if tree.newdoc: + print(f'


{tree.newdoc if tree.newdoc is not True else ""}


') + elif tree.newpar: + print('
') + opened, prev_node_mention = [], True + rtl = ' dir="rtl"' if self.rtl else "" + print(f'

') + for node in nodes_and_empty: + if not prev_node_mention and subspans and subspans[-1].words[0] == node: + print('', end='') + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + self._start_subspan(subspan) + opened.append(subspan) + + if not opened and prev_node_mention: + print('', end='') + prev_node_mention = True if opened else False + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + if self.rtl: + print('', end='') + else: + print('', end='') + opened.pop() + + # Two mentions are crossing iff their spans have non-zero intersection, + # but neither is a subset of the other, e.g. (e1 ... (e2 ... e1) ... e2). + # Let's visualize this (simplified) as + # ......... + # i.e. let's split mention e2 into two subspans which are next to each other. + # Unfortunatelly, we cannot mark now both crossing mentions using html class "crossing" + # (opening tags are already printed), so we'll mark only the second part of the second mention. + endings = [x for x in opened if x.words[-1] == node] + if endings: + new_opened, brokens, found_crossing = [], [], False + for subspan in opened: + if subspan.words[-1] == node: + found_crossing = True + elif found_crossing: + brokens.append(subspan) + else: + new_opened.append(subspan) + opened = new_opened + print('' * (len(endings) + len(brokens)), end='') + for broken in brokens: + self._start_subspan(broken, True) + opened.append(subspan) + + if not node.no_space_after: + print(' ', end='') + + if not prev_node_mention: + print('', end='') + print('

') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + + +# id needs to be a valid DOM querySelector +# so it cannot contain [#./:] and maybe more, +# so let's substitute all [^\w\d-] to be on the safe side. +# DOM IDs cannot start with a digit, so prepend e.g. "n" if needed. +def _dom_esc(string): + if string[0].isdecimal(): + string = 'n' + string + return re.sub(r'[^\w\d-]', '_', string) + +def _id(node): + if node is None: + return 'null' + return _dom_esc(node.address()) + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,32 @@ def process_document(self, doc): print('\n') print('
') + + def print_doc_json(self, doc): + print('[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue + try: + trees = bundle.trees + except: + trees = [bundle] # allow calling print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -101,24 +117,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + print(']') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py new file mode 100644 index 00000000..49f9beb0 --- /dev/null +++ b/udapi/block/write/oldcorefud.py @@ -0,0 +1,58 @@ +"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.write.conllu + +class OldCorefUD(udapi.block.write.conllu.Conllu): + + def process_document(self, doc): + if not doc.coref_entities: + logging.warning("Using write.OldCorefUD on a document without any coreference annotation") + + # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. + attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + del doc.meta['global.Entity'] + + # doc._eid_to_entity is a dict, which is insertion ordered in Python 3.7+. + # The insertion order is sorted according to CorefEntity.__lt__ (see a few lines above). + # However, new entities could be added meanwhile or some entities edited, + # so we need to sort the entities again before storing to MISC. + # We also need to make sure entity.mentions are sorted in each entity + # because the ordering of entities is defined by the first mention in each entity. + # Ordering of mentions within an entity can be changed when e.g. changing the span + # of a given mention or reordering words within a sentence, and in such events + # Udapi currently does not automatically update the ordering of entities.
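For context, converting a file with the current (GUM-style Entity) annotation back to the CorefUD 0.1 attributes could be done roughly like this. A hedged sketch, not part of the patch; the file names are placeholders.

    import udapi
    from udapi.block.write.oldcorefud import OldCorefUD

    doc = udapi.Document("corefud-new.conllu")   # coreference in the Entity attribute of MISC
    OldCorefUD(files="corefud-old.conllu").apply_on_document(doc)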
+ for entity in doc.coref_entities: + entity._mentions.sort() + for entity in sorted(doc.coref_entities): + for mention in entity.mentions: + head = mention.head + if head.misc["ClusterId"]: + for a in attrs: + if head.misc[a]: + head.misc[a + "[1]"] = head.misc[a] + del head.misc[a] + index_str = "[2]" + else: + index, index_str = 1, "[1]" + while(head.misc["ClusterId" + index_str]): + index += 1 + index_str = f"[{index}]" + if index == 1: + index_str = "" + head.misc["ClusterId" + index_str] = entity.eid + head.misc["MentionSpan" + index_str] = mention.span + head.misc["ClusterType" + index_str] = entity.etype + if mention._bridging: + head.misc["Bridging" + index_str] = ','.join(f'{l.target.eid}:{l.relation}' for l in sorted(mention.bridging)) + if entity.split_ante: + serialized = ','.join((c.eid for c in sorted(entity.split_ante))) + head.misc["SplitAnte" + index_str] = serialized + if mention.other: + head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') + + super().process_document(doc) diff --git a/udapi/block/write/sentences.py b/udapi/block/write/sentences.py index 60eb6bec..70553d7d 100644 --- a/udapi/block/write/sentences.py +++ b/udapi/block/write/sentences.py @@ -3,13 +3,14 @@ class Sentences(BaseWriter): - """A writer of plain-text sentences (one per line). + """A writer of plain-text sentences (one sentence per line). Usage: udapy write.Sentences if_missing=empty < my.conllu > my.txt + udapy write.Sentences newdoc=1 newpar=1 < my.conllu > my.txt """ - def __init__(self, if_missing='detokenize', **kwargs): + def __init__(self, if_missing='detokenize', newdoc=None, newpar=None, **kwargs): """Create the Sentences writer block. Parameters: @@ -18,9 +19,21 @@ def __init__(self, if_missing='detokenize', **kwargs): * `empty`: print an empty line * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` * `fatal`: raise an exception + newdoc: What to do if `root.newdoc` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) + newpar: What to do if `root.newpar` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) """ super().__init__(**kwargs) self.if_missing = if_missing + self.newdoc = newdoc + self.newpar = newpar def process_tree(self, tree): + if self.newdoc and tree.newdoc and tree.bundle.number > 1: + print() + if self.newpar and tree.newpar and tree.bundle.number > 1: + print() print(tree.get_sentence(self.if_missing)) diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. + + Parameters: + if_missing: What to do if `root.text` is `None`? (default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. 
+ * `empty`: print an empty line + * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` + * `fatal`: raise an exception + """ + super().__init__(**kwargs) + self.title = title + self.if_missing = if_missing + + def before_process_document(self, document): + super().before_process_document(document) + print('\n\n\n') + print('' + self.title + '') + print('\n\n
    \n') + + def after_process_document(self, document): + print("
\n\n") + super().after_process_document(document) + + def process_tree(self, tree): + print('
  • %s
  • ' % (tree.sent_id, tree.get_sentence(self.if_missing))) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index fb38c22a..885f797f 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,4 +1,5 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys @@ -13,6 +14,7 @@ 'upos': 'red', 'deprel': 'blue', 'ord': 'green', + 'misc[Entity]': 'magenta', } # Too many instance variables, arguments, branches... @@ -127,6 +129,14 @@ class TextModeTrees(BaseWriter): which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. + For use in LaTeX, you can insert the output of this block (without colors) + into \begin{verbatim}...\end{verbatim}, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preambule:: + + \\usepackage{pmboxdraw} + \DeclareUnicodeCharacter{256D}{\textSFi} %╭ + \DeclareUnicodeCharacter{2570}{\textSFii} %╰ + SEE ALSO :py:class:`.TextModeTreesHtml` """ @@ -205,7 +215,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.mark_re = re.compile(mark + '=') self.comment_mark_re = re.compile(r'^ %s = ' % mark, re.M) self._index_of = [] - self._gaps = [] + self._gaps = collections.Counter() self.lines = [] self.lengths = [] @@ -235,7 +245,7 @@ def should_print_tree(self, root, allnodes): return False return self.comment_mark_re.search(root.comment) - def process_tree(self, root): + def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" if self.print_empty: if root.is_root(): @@ -247,7 +257,7 @@ def process_tree(self, root): allnodes.sort() else: allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root, allnodes): + if not force_print and not self.should_print_tree(root, allnodes): return self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) @@ -255,7 +265,6 @@ def process_tree(self, root): # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = collections.Counter() self._compute_gaps(root) # Precompute lines for printing @@ -291,7 +300,7 @@ def process_tree(self, root): # sorting the stack to minimize crossings of edges if self.minimize_cross: - stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) + stack.sort(key=lambda x: -self._gaps[x.ord]) if self.layout == 'classic': for idx, node in enumerate(allnodes): @@ -337,11 +346,16 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R" + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. 
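One way the marking options touched here (together with the util.MarkDiff changes earlier in this patch) can be combined is a gold/pred comparison rendered as HTML. A hedged sketch, not part of the patch, assuming parallel.conllu holds two parallel zones per bundle with the gold zone named "gold":

    import udapi
    from udapi.block.util.markdiff import MarkDiff
    from udapi.block.write.textmodetreeshtml import TextModeTreesHtml

    doc = udapi.Document("parallel.conllu")
    # Mark differing nodes and print the 20 most frequent kinds of edits at the end.
    MarkDiff(gold_zone="gold", print_stats=20).apply_on_document(doc)
    # Render only the bundles where a difference was marked, zones side by side.
    TextModeTreesHtml(files="diff.html", marked_only=True, zones_in_rows=True).apply_on_document(doc)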
+ os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): - print('%s = %s' % (key, value)) + if key[0] != '_': + print('%s = %s' % (key, value)) def _add(self, idx, text): self.lines[idx] += text @@ -352,11 +366,15 @@ def add_node(self, idx, node): if not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) + marked = self.is_marked(node) if self.color: - marked = self.is_marked(node) for i, attr in enumerate(self.attrs): values[i] = self.colorize_attr(attr, values[i], marked) - self.lines[idx] += ' ' + ' '.join(values) + if not self.color and marked: + self.lines[idx] += ' **' + ' '.join(values) + '**' + self.lengths[idx] += 4 + else: + self.lines[idx] += ' ' + ' '.join(values) def is_marked(self, node): """Should a given node be highlighted?""" diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 75a39a97..0ad39da4 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. """ - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, whole_bundle=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -35,9 +35,14 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): (see the `mark` parameter) to be more eye-catching. title: What title metadata to use for the html? + zones_in_rows: print trees from the same bundle side by side (i.e. in the same row). + whole_bundle: always print the whole bundle (all its trees) if any of the trees is marked + (relevant only with marked_only=True and zones_in_rows=True) """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows + self.whole_bundle = whole_bundle def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -53,8 +58,8 @@ def before_process_document(self, document): print('%s = %s' % (key, value)) def after_process_document(self, document): - super().after_process_document(document) print("\n\n") + super().after_process_document(document) def add_node(self, idx, node): if not node.is_root(): @@ -82,3 +87,27 @@ def print_headers(self, root): print(escape(text)) if self.print_comments and root.comment: print('#' + self.colorize_comment(escape(root.comment)).rstrip().replace('\n', '\n#')) + + def process_bundle(self, bundle): + if self.zones_in_rows: + # Don't print
    if no tree will be printed in this bundle. + marked_trees = [] + for tree in bundle: + if self._should_process_tree(tree): + if self.print_empty: + allnodes = [tree] + tree.descendants_and_empty + else: + allnodes = tree.descendants(add_self=1) + if self.should_print_tree(tree, allnodes): + marked_trees.append(tree) + if marked_trees: + if self.whole_bundle: + marked_trees = bundle + print("") + for tree in marked_trees: + print("") + print("
    ") + self.process_tree(tree, force_print=True) + print("
    ") + else: + super().process_bundle(bundle) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 58f53a3d..40071739 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -39,7 +39,8 @@ class Tikz(BaseWriter): """ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, - attributes=None, as_tree=False, comment_attribute=None, **kwargs): + attributes=None, as_tree=False, comment_attribute=None, + enhanced=False, **kwargs): """Create the Tikz block object. Args: @@ -50,6 +51,7 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, attributes: comma-separated list of node attributes to print (each on a separate line). as_tree: boolean - should print it as a 2D tree? comment_attribute: which attribute to print as a string under each graph (e.g. text_en) + enhanced: boolean - print the enhanced graph below the sentence, too? """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -63,6 +65,9 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, self.node_attributes = 'form,upos'.split(',') self.as_tree = as_tree self.comment_attribute = comment_attribute + if as_tree and enhanced: + raise ValueError("The enhanced graph cannot be printed as a tree") + self.enhanced = enhanced def before_process_document(self, doc): super().before_process_document(doc) @@ -91,6 +96,9 @@ def after_process_document(self, doc): logging.info('Use pdflatex to compile the output') super().after_process_document(doc) + def _tex_escape(self, string): + return string.replace('_', r'\_').replace('$', '\$').replace('[', '$[$').replace(']', '$]$') + def process_tree(self, tree): print(r'\begin{dependency}') print(r'\begin{deptext}') @@ -109,7 +117,7 @@ def process_tree(self, tree): lines = ['' for _ in self.node_attributes] for node in nodes: - values = [v.replace('_', r'\_') for v in node.get_attrs(self.node_attributes)] + values = [self._tex_escape(v) for v in node.get_attrs(self.node_attributes)] max_len = max(len(value) for value in values) for index, value in enumerate(values): if node.ord > 1: @@ -137,6 +145,12 @@ def process_tree(self, tree): print(r'\deproot{%d}{root}' % node.ord) else: print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.enhanced: + for dep in node.deps: + if dep['parent'].is_root(): + print(r'\deproot[edge below]{%d}{root}' % node.ord) + else: + print(r'\depedge[edge below]{%d}{%d}{%s}' % (dep['parent'].ord, node.ord, dep['deprel'])) if self.comment_attribute and tree.comment: start_pos = tree.comment.find(self.comment_attribute + ' = ') if start_pos != -1: diff --git a/udapi/block/write/vislcg.py b/udapi/block/write/vislcg.py index 569b1056..acdf1e80 100644 --- a/udapi/block/write/vislcg.py +++ b/udapi/block/write/vislcg.py @@ -64,10 +64,7 @@ def process_tree(self, tree): # Print the line with forms and optional upos tags and feats. 
for token in tree.token_descendants: print('"<%s>"' % self._escape(token.form)) - try: - words = token.words - except AttributeError: - words = [token] + words = token.words print('\t' + self._node(words[0])) for nonfirst_mwt_word in words[1:]: print('\t\t' + self._node(nonfirst_mwt_word)) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 05f204b9..c3bcf918 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,8 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -28,6 +29,16 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 + # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. + # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it here, in the reader. + # The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, + # which reads all the trees in a file at once, but it does not have access to the document instance, + # it just returns a sequence of trees (which may be split into multiple documents if `bundles_per_doc` is set). + # So `read.Conllu` cannot store the `global.Entity` in `document.meta['global.Entity']` where it belongs. 
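The new reader parameters documented above could be exercised roughly like this. A hedged sketch, not part of the patch; the file glob is a placeholder following the files='!*.conllu' convention mentioned later in this patch.

    import udapi
    from udapi.block.read.conllu import Conllu

    doc = udapi.Document()
    # Merge all matching files into one Udapi document, but stop after the
    # first 10 "# newdoc" documents thanks to max_docs.
    Conllu(files='!*.conllu', merge=True, max_docs=10).apply_on_document(doc)
    # The global.Entity header (if present in the input) ends up in doc.meta.
    print(doc.meta.get('global.Entity'))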
+ self._global_entity = None @staticmethod def is_multizone_reader(): @@ -86,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): @@ -104,42 +121,56 @@ def try_fast_load(self, document): if filehandle is None: self.finished = True return True - try: - trees = self.read_trees() - except NotImplementedError: - return False + logging.info(f"Reading {self.files.filename}") - document.meta['loaded_from'] = self.filename - if trees and trees[0].newdoc and trees[0].newdoc is not True: - document.meta["docname"] = trees[0].newdoc - - bundle, last_bundle_id = None, '' - for root in trees: - add_to_the_last_bundle = False - - if self.ignore_sent_id: - root._sent_id = None - elif root._sent_id is not None: - parts = root._sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - if self.zone != 'keep': - root.zone = self.zone - - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - bundle = document.create_bundle() - if last_bundle_id != '': - bundle.bundle_id = last_bundle_id - - bundle.add_tree(root) - - self.next_filehandle() - if self.filehandle is None: - self.finished = True + while True: + try: + trees = self.read_trees() + except NotImplementedError: + return False + + document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + if trees and trees[0].newdoc and trees[0].newdoc is not True: + document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + if root is None: + continue + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 + add_to_the_last_bundle = False + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone + + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + self.next_filehandle() + if self.filehandle is None: + self.finished = True + return True + if not self.merge: + return True return True # pylint: disable=too-many-branches,too-many-statements @@ -166,8 +197,12 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc + document.meta['global.Entity'] = self._global_entity + document.meta['loaded_from'] = self.filename filehandle = self.filehandle if 
filehandle is None: @@ -175,18 +210,32 @@ def process_document(self, document): if filehandle is None: self.finished = True return + logging.info(f"Reading {self.files.filename}") trees_loaded = 0 while True: root = self.filtered_read_tree() if root is None: - if trees_loaded == 0 and self.files.has_next_file(): + if (trees_loaded == 0 or self.merge) and self.files.has_next_file(): filehandle = self.next_filehandle() + logging.info(f"Reading {self.files.filename}") continue self.finished = not self.files.has_next_file() break if trees_loaded == 0: document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -205,6 +254,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -214,6 +266,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: @@ -256,3 +309,16 @@ def process_document(self, document): if gc_was_enabled: gc.enable() gc.collect() + + def read_documents(self): + """Load all documents of this reader and return them as a list.""" + # udapi.core.document imports udapi.block.read.conllu because of doc.load_conllu(filename) + # and udapi.block.read.conllu loads this module (udapi.core.basereader), + # so we cannot load udapi.core.document at the beginning of this module. 
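A hedged usage sketch of the new `read_documents()` helper defined here: with the default `merge=False` and `bundles_per_doc=0`, each input file should end up in its own `Document` (the glob pattern is hypothetical):

```python
from udapi.block.read.conllu import Conllu

docs = Conllu(files='!data/*.conllu').read_documents()
print(len(docs), "documents loaded")
```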
+ from udapi.core.document import Document + docs = [] + while not self.finished: + doc = Document() + self.apply_on_document(doc) + docs.append(doc) + return docs diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 0db348a8..071ec124 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,8 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os +from pathlib import Path import udapi.core.coref from udapi.core.block import Block @@ -8,10 +10,10 @@ class BaseWriter(Block): - """Base class for all reader blocks.""" + """Base class for all writer blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +31,10 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep + self.path = path @property def filename(self): @@ -45,7 +51,8 @@ def next_filename(self): return self.files.next_filename() def before_process_document(self, document): - udapi.core.coref.store_coref_to_misc(document) + if document: + udapi.core.coref.store_coref_to_misc(document) if self.orig_files == '': logging.info('Writing to filehandle.') sys.stdout = self.files.filehandle @@ -56,16 +63,28 @@ def before_process_document(self, document): docname = document.meta.get('docname', None) if docname is not None: logging.info('Writing to file %s.', docname) - sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(docname) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) logging.info('Writing to file %s.', docname) - sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(docname) else: - logging.warning('overwrite=1 but documet.meta["loaded_from"] is None') + logging.warning('using overwrite or path but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: @@ -78,10 +97,13 @@ def before_process_document(self, document): sys.stdout = self.orig_stdout else: logging.info('Writing to file %s.', filename) - sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(filename) if old_filehandle not in (sys.stdout, self.orig_stdout): old_filehandle.close() + def _open(self, filename): + Path(filename).parent.mkdir(parents=True, exist_ok=True) + return open(filename, 'wt', encoding=self.encoding, newline=self.newline) def after_process_document(self, 
document): sys.stdout.flush() diff --git a/udapi/core/block.py b/udapi/core/block.py index 64b8bcc5..d293df61 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,6 +1,10 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect +def not_overridden(method): + method.is_not_overridden = True + return method class Block(object): """The smallest processing unit for processing Universal Dependencies data. @@ -11,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." + self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -23,15 +41,27 @@ def process_end(self): """A hook method that is executed after processing all UD data""" pass + @not_overridden def process_node(self, _): """Process a UD node""" - raise Exception("No processing activity defined in block " + str(self)) + pass + @not_overridden + def process_empty_node(self, _): + """Process an empty node (in enhanced dependencies)""" + pass + + @not_overridden def process_tree(self, tree): """Process a UD tree""" - for node in tree._descendants: + # tree.descendants is slightly slower than tree._descendants (0.05s per iterating over 700k words), + # but it seems safer to iterate over a copy of the list of nodes. + # If a user calls parent.create_child().shift_before_node(parent) in process_node, + # it may end up in endless cycle (because the same node is processed again - Python for cycle remembers the position). + for node in tree.descendants: self.process_node(node) + @not_overridden def process_bundle(self, bundle): """Process a UD bundle""" for tree in bundle: @@ -50,10 +80,59 @@ def apply_on_document(self, document): def process_document(self, document): """Process a UD document""" - for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) - self.process_bundle(bundle) + # Calling document.coref_entities is expensive because + # it needs to deserialize coref_entities from the MISC attributes. + # If no block in a scenario needs to process coreference entities/mentions, + # the deserialization does not need to be done. + # So we need to detect if any of the methods process_coref_entity and process_coref_mention + # has been overriden (without calling them, which could have adverse side effects). + # Let's use method annotations for this. 
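The override-detection idea described in the comment above can be illustrated with a small self-contained sketch (plain Python, not udapi code): the decorator marks the base-class method, and any subclass that redefines the method implicitly drops the marker.

```python
def not_overridden(method):
    method.is_not_overridden = True
    return method

class Base:
    @not_overridden
    def process_node(self, node):
        pass

class MyBlock(Base):
    def process_node(self, node):  # redefined, so the marker attribute is gone
        print(node)

print(hasattr(Base().process_node, 'is_not_overridden'))     # True
print(hasattr(MyBlock().process_node, 'is_not_overridden'))  # False
```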
+ p_entity = not hasattr(self.process_coref_entity, 'is_not_overridden') + p_mention = not hasattr(self.process_coref_mention, 'is_not_overridden') + p_bundle = not hasattr(self.process_bundle, 'is_not_overridden') + p_tree = not hasattr(self.process_tree, 'is_not_overridden') + p_node = not hasattr(self.process_node, 'is_not_overridden') + p_empty_node = not hasattr(self.process_empty_node, 'is_not_overridden') + if not any((p_entity, p_mention, p_bundle, p_tree, p_node, p_empty_node)): + raise Exception("No processing activity defined in block " + self.block_name()) + + if p_entity or p_mention: + for entity in document.coref_entities: + if p_entity: + self.process_coref_entity(entity) + else: + for mention in entity.mentions: + self.process_coref_mention(mention) + + if p_bundle or p_tree or p_node or p_empty_node: + for bundle_no, bundle in enumerate(document.bundles, 1): + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') + if p_bundle: + self.process_bundle(bundle) + else: + for tree in bundle: + if self._should_process_tree(tree): + if p_tree: + self.process_tree(tree) + else: + if p_node: + for node in tree.descendants: + self.process_node(node) + if p_empty_node: + for empty_node in tree.empty_nodes: + self.process_empty_node(empty_node) + + @not_overridden + def process_coref_entity(self, entity): + """This method is called on each coreference entity in the document.""" + for mention in entity.mentions: + self.process_coref_mention(mention) + + @not_overridden + def process_coref_mention(self, mention): + """This method is called on each coreference mention in the document.""" + pass def before_process_document(self, document): """This method is called before each process_document.""" diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py index 110ed42c..0a637f01 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -39,9 +39,9 @@ def bundle_id(self, bundle_id): tree._sent_id = bundle_id + '/' + tree.zone # pylint: disable=protected-access def __str__(self): - if self.bundle_id is None: + if self._bundle_id is None: return 'bundle without id' - return "bundle id='%s'" % self.bundle_id + return f"bundle id='{self._bundle_id}'" def __iter__(self): return iter(self.trees) @@ -72,7 +72,7 @@ def has_tree(self, zone=''): def create_tree(self, zone=None): """Return the root of a newly added tree with a given zone.""" root = Root() - root.zone = zone + root._zone = zone self.add_tree(root) return root @@ -89,8 +89,12 @@ def check_zone(self, new_zone): def add_tree(self, root): """Add an existing tree to the bundle.""" if root.zone is None: - root.zone = '' + root._zone = '' self.check_zone(root.zone) + if self._bundle_id: + root._sent_id = self._bundle_id + if root.zone: + root._sent_id += '/' + root.zone root.bundle = self self.trees.append(root) doc_json = root.json.get('__doc__') @@ -107,8 +111,17 @@ def remove(self): def address(self): """Return bundle_id or '?' if missing.""" - return self.bundle_id if self.bundle_id is not None else '?' + return self._bundle_id if self._bundle_id is not None else '?' def draw(self, **kwargs): """Pretty print the trees using TextModeTrees.""" TextModeTrees(**kwargs).process_bundle(self) + + @property + def nodes(self): + """An iterator over all nodes (excluding empty nodes) in all trees in this bundle.""" + for tree in self: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process_tree(). 
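A hedged sketch of a user block relying on the new `process_coref_mention` hook (the block name is made up); because the hook is overridden, the `process_document` dispatch above will deserialize coreference and call it on every mention:

```python
from udapi.core.block import Block

class PrintMentionHeads(Block):
    """Print the entity ID and the head form of every coreference mention."""
    def process_coref_mention(self, mention):
        print(mention.entity.eid, mention.head.form)

# usage, assuming `doc` is an already loaded udapi Document:
# PrintMentionHeads().apply_on_document(doc)
```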
+ for node in tree.descendants: + yield node diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 6236e4cf..aa27e6a7 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,42 +1,191 @@ -"""Classes for handling coreference.""" +"""Classes for handling coreference. + +# CorefUD 1.0 format implementation details + +## Rules for ordering "chunks" within `node.misc['Entity']` +Entity mentions are annotated using "chunks" stored in `misc['Entity']`. +Chunks are of three types: +1. opening bracket, e.g. `(e1-person` +2. closing bracket, e.g. `e1-person)` +3. single-word span (both opening and closing), e.g. `(e1-person)` + +The `Entity` MISC attribute contains a sequence of chunks +without any separators, e.g. `Entity=(e1-person(e2-place)` +means opening `e1` mention and single-word `e2` mention +starting on a given node. + +### Crossing mentions +Two mentions are crossing iff their spans have non-empty intersection, +but neither is a subset of the other, e.g. `e1` spanning nodes 1-3 +and `e2` spanning 2-4 would be represented as: +``` +1 ... Entity=(e1 +2 ... Entity=(e2 +3 ... Entity=e1) +4 ... Entity=e2) +``` +This may be an annotation error and we may forbid such cases in future annotation guidelines, +but in CorefUD 0.2, there are thousands of such cases (see https://github.com/ufal/corefUD/issues/23). + +It can even happen that one entity ends and another starts at the same node: `Entity=e1)(e2` +For this reason, we need + +**Rule1**: closing brackets MUST always precede opening brackets. +Otherwise, we would get `Entity=(e2e1)`, which could not be parsed. + +Note that we cannot have same-entity crossing mentions in the CorefUD 1.0 format, +so e.g. if we substitute `e2` with `e1` in the example above, we'll get +`(e1`, `e1)`, `(e1`, `e1)`, which will be interpreted as two non-overlapping mentions of the same entity. + +### Nested mentions +One mention (span) can be often embedded within another mention (span). +It can happen that both these mentions correspond to the same entity (i.e. are in the same cluster), +for example, "` sold the world>`". +It can even happen that both mentions start at the same node, e.g. "`< w3>`" (TODO: find nice real-world examples). +In such cases, we need to make sure the brackets are well-nested: + +**Rule2**: when opening multiple brackets at the same node, longer mentions MUST be opened first. + +This is important because +- The closing bracket has the same form for both mentions of the same entity - it includes just the entity ID (`eid`). +- The opening-bracket annotation contains other mention attributes, e.g. head index. +- The two mentions may differ in these attributes, e.g. the "``" mention's head may be w3. +- When breaking Rule2, we would get +``` +1 w1 ... Entity=(e1-person-1(e1-person-3 +2 w2 ... Entity=e1) +3 w3 ... Entity=e1) +``` +which would be interpreted as if the head of the "``" mention is its third word, which is invalid. + +### Other rules + +**Rule3**: when closing multiple brackets at the same node, shorter mentions SHOULD be closed first. +See Rule4 for a single exception from this rule regarding crossing mentions. +I'm not aware of any problems when breaking this rule, but it seems intuitive +(to make the annotation well-nested if possible) and we want to define some canonical ordering anyway. +The API should be able to load even files breaking Rule3. + +**Rule4**: single-word chunks SHOULD follow all opening brackets and precede all closing brackets if possible. 
+When considering single-word chunks as a subtype of both opening and closing brackets, +this rule follows from the well-nestedness (and Rule2). +So we should have `Entity=(e1(e2)` and `Entity=(e3)e1)`, +but the API should be able to load even `Entity=(e2)(e1` and `Entity=e1)(e3)`. + +In case of crossing mentions (annotated following Rule1), we cannot follow Rule4. +If we want to add a single-word mention `e2` to a node with `Entity=e1)(e3`, +it seems intuitive to prefer Rule2 over Rule3, which results in `Entity=e1)(e3(e2)`. +So the canonical ordering will be achieved by placing single-word chunks after all opening brackets. +The API should be able to load even `Entity=(e2)e1)(e3` and `Entity=e1)(e2)(e3`. + +**Rule5**: ordering of same-span single-word mentions +TODO: I am not sure here. We may want to forbid such cases or define canonical ordering even for them. +E.g. `Entity=(e1)(e2)` vs. `Entity=(e2)(e1)`. + +**Rule6**: ordering of same-start same-end multiword mentions +TODO: I am not sure here. +These can be either same-span multiword mentions (which may be forbidden) +or something like +``` +1 w1 ... Entity=(e1(e2[1/2]) +2 w2 ... +3 w3 ... Entity=(e2[2/2])e1) +``` +where both `e1` and `e2` start at w1 and end at w3, but `e2` is discontinuous and does not contain w2. +If we interpret "shorter" and "longer" in Rule2 and Rule3 as `len(mention.words)` +(and not as `mention.words[-1].ord - mention.words[0].ord`), +we get the canonical ordering as in the example above. + +""" import re import functools import collections +import collections.abc +import copy import logging +import bisect @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words', 'misc'] + __slots__ = ['_head', '_entity', '_bridging', '_words', '_other'] - def __init__(self, head, cluster=None): - self._head = head - self._cluster = cluster - if cluster is not None: - cluster._mentions.append(self) + def __init__(self, words, head=None, entity=None, add_word_backlinks=True): + if not words: + raise ValueError("mention.words must be non-empty") + self._head = head if head else words[0] + self._entity = entity + if entity is not None: + entity._mentions.append(self) self._bridging = None - self._words = [] - self.misc = None + self._other = None + self._words = words + if add_word_backlinks: + for new_word in words: + if not new_word._mentions or not entity or self > new_word._mentions[-1]: + new_word._mentions.append(self) + else: + new_word._mentions.append(self) + new_word._mentions.sort() + + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result - def __lt__(self, other): - """Does this mention precedes (word-order wise) the `other` mention? + def __lt__(self, another): + """Does this mention precedes (word-order wise) `another` mention? This method defines a total ordering of all mentions - (within one cluster or across different clusters). - The position is primarily defined by the first word in each mention - (or by the head if mention.words are missing). + (within one entity or across different entities). + The position is primarily defined by the first word in each mention. 
If two mentions start at the same word, - their order is defined by the last word in their span - -- the shorter mention precedes the longer one. + their order is defined by their length (i.e. number of words) + -- the shorter mention follows the longer one. + + In the rare case of two same-length mentions starting at the same word, but having different spans, + their order is defined by the order of the last word in their span. + For example precedes . + + The order of two same-span mentions is currently defined by their eid. + There should be no same-span (or same-subspan) same-entity mentions. """ - node1 = self._words[0] if self._words else self._head - node2 = other._words[0] if other._words else other._head - if node1 is node2: - node1 = self._words[-1] if self._words else self._head - node2 = other._words[-1] if other._words else other._head - if node1 is node2: - return len(self._words) < len(other._words) - return node1.precedes(node2) + #TODO: no mention.words should be handled already when loading + if not self._words: + self._words = [self._head] + if not another._words: + another._words = [another._head] + + if self._words[0] is another._words[0]: + if len(self._words) > len(another._words): + return True + if len(self._words) < len(another._words): + return False + if self._words[-1].precedes(another._words[-1]): + return True + if another._words[-1].precedes(self._words[-1]): + return False + return self._entity.eid < another._entity.eid + return self._words[0].precedes(another._words[0]) + + @property + def other(self): + if self._other is None: + self._other = OtherDualDict() + return self._other + + @other.setter + def other(self, value): + if self._other is None: + self._other = OtherDualDict(value) + else: + self._other.set_mapping(value) @property def head(self): @@ -49,15 +198,18 @@ def head(self, new_head): self._head = new_head @property - def cluster(self): - return self._cluster + def entity(self): + return self._entity - @cluster.setter - def cluster(self, new_cluster): - if self._cluster is not None: - raise NotImplementedError('changing the cluster of a mention not supported yet') - self._cluster = new_cluster - new_cluster._mentions.append(new_cluster) + @entity.setter + def entity(self, new_entity): + if self._entity is not None: + original_entity = self._entity + original_entity._mentions.remove(self) + if not original_entity._mentions: + logging.warning(f"Original entity {original_entity.eid} is now empty.") + self._entity = new_entity + bisect.insort(new_entity._mentions, self) @property def bridging(self): @@ -69,19 +221,25 @@ def bridging(self): @property def words(self): + # Words in a sentence could have been reordered, so we cannot rely on sorting self._words in the setter. + # The serialization relies on storing the opening bracket in the first word (and closing in the last), + # so we need to make sure the words are always returned sorted. + # TODO: benchmark updating the order of mention._words in node.shift_*() and node.remove(). + self._words.sort() return self._words @words.setter def words(self, new_words): if new_words and self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words {new_words}") + raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._entity.eid}") kept_words = [] + # Make sure each word is included just once and they are in the correct order. 
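A hedged sketch of the renamed and added mention attributes touched here (assuming `mention` is an existing `CorefMention` and `other_entity` an existing `CorefEntity` in the same document):

```python
mention.other['mention'] = 'np'   # lazily (de)serialized key:value store, similar to MISC
print(str(mention.other))         # e.g. 'mention:np'
mention.entity = other_entity     # re-links the mention; both entities' mention lists are updated
```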
+ new_words = sorted(list(set(new_words))) for old_word in self._words: if old_word in new_words: kept_words.append(old_word) else: old_word._mentions.remove(self) - new_words.sort() self._words = new_words for new_word in new_words: if new_word not in kept_words: @@ -99,77 +257,134 @@ def span(self): def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) + def __str__(self): + """String representation of the CorefMention object: Mention.""" + return f"Mention<{self._entity._eid}: {self._head}>" + + def remove(self): + for word in self._words: + word._mentions.remove(self) + self._entity._mentions.remove(self) + + +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + return self.mention < another.mention + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + + +CHARS_FORBIDDEN_IN_ID = "-=| \t()" + @functools.total_ordering -class CorefCluster(object): +class CorefEntity(object): """Class for representing all mentions of a given entity.""" - __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] + __slots__ = ['_eid', '_mentions', 'etype', 'split_ante'] - def __init__(self, cluster_id, cluster_type=None): - self._cluster_id = cluster_id + def __init__(self, eid, etype=None): + self._eid = None # prepare the _eid slot + self.eid = eid # call the setter and check the ID is valid self._mentions = [] - self.cluster_type = cluster_type + self.etype = etype self.split_ante = [] - def __lt__(self, other): - """Does this CorefCluster precedes (word-order wise) the `other` cluster? + def __lt__(self, another): + """Does this CorefEntity precede (word-order wise) `another` entity? - This method defines a total ordering of all clusters - by the first mention of each cluster (see `CorefMention.__lt__`). - If one of the clusters has no mentions (which should not happen normally), + This method defines a total ordering of all entities + by the first mention of each entity (see `CorefMention.__lt__`). + If one of the entities has no mentions (which should not happen normally), there is a backup solution (see the source code). - If cluster IDs are not important, it is recommended to use block - `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering. + If entity IDs are not important, it is recommended to use block + `corefud.IndexClusters` to re-name entity IDs in accordance with this entity ordering. """ - if not self._mentions or not other._mentions: - # Clusters without mentions should go first, so the ordering is total. - # If both clusters are missing mentions, let's use cluster_id, so the ordering is stable. - if not self._mentions and not other._mentions: - return self._cluster_id < other._cluster_id + if not self._mentions or not another._mentions: + # Entities without mentions should go first, so the ordering is total. + # If both entities are missing mentions, let's use eid, so the ordering is stable. 
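A hedged sketch of the span and removal API touched here (assuming `mention` is an existing `CorefMention`; the span strings are illustrative):

```python
print(mention.span)     # compact encoding of mention.words, e.g. '3-5,6'
mention.span = '3-4'    # re-assigns mention.words via span_to_nodes on the head's tree
mention.remove()        # detach the mention from its words and from its entity
```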
+ if not self._mentions and not another._mentions: + return self._eid < another._eid return not self._mentions - return self._mentions[0] < other._mentions[0] + return self._mentions[0] < another._mentions[0] @property - def cluster_id(self): - return self._cluster_id + def eid(self): + return self._eid + + @eid.setter + def eid(self, new_eid): + if any(x in new_eid for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{new_eid} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") + self._eid = new_eid + + @property + def eid_or_grp(self): + root = self._mentions[0].head.root + meta = root.document.meta + if 'GRP' in meta['global.Entity'] and meta['_tree2docid']: + docid = meta['_tree2docid'][root] + if self._eid.startswith(docid): + return self._eid.replace(docid, '', 1) + else: + logging.warning(f"GRP in global.Entity, but eid={self._eid} does not start with docid={docid}") + return self._eid @property def mentions(self): return self._mentions - def create_mention(self, head=None, mention_words=None, mention_span=None): - """Create a new CoreferenceMention object within this CorefCluster. + def create_mention(self, head=None, words=None, span=None): + """Create a new CoreferenceMention object within this CorefEntity. Args: head: a node where the annotation about this CorefMention will be stored in MISC. The head is supposed to be the linguistic head of the mention, i.e. the highest node in the dependency tree, but if such information is not available (yet), - it can be any node within the mention_words. - If no head is specified, the first word from mention_words will be used instead. - mention_words: a list of nodes of the mention. + it can be any node within the `words`. + If no head is specified, the first word from `words` will be used instead. + words: a list of nodes of the mention. This argument is optional, but if provided, it must contain the head. The nodes can be both normal nodes or empty nodes. - mention_span: an alternative way how to specify mention_words + span: an alternative way how to specify `words` using a string such as "3-5,6,7.1-7.2". (which means, there is an empty node 5.1 and normal node 7, which are not part of the mention). - At most one of the args mention_words and mention_span can be specified. + At most one of the args `words` and `span` can be specified. 
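A hedged usage sketch of the renamed `create_mention` arguments (assuming `doc` is a `Document` and `node`, `other_node` are nodes of one tree; `create_coref_entity` is added to document.py later in this diff):

```python
entity = doc.create_coref_entity(etype='person')
m1 = entity.create_mention(head=node)                             # single-word mention
m2 = entity.create_mention(head=node, words=[node, other_node])   # head must be among the words
```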
""" - if mention_words and mention_span: - raise ValueError("Cannot specify both mention_words and mention_span") - if head and mention_words and head not in mention_words: - raise ValueError(f"Head {head} is not among the specified mention_words") - if head is None and mention_words is None: - raise ValueError("Either head or mention_words must be specified") + if words and span: + raise ValueError("Cannot specify both words and span") + if head and words and head not in words: + raise ValueError(f"Head {head} is not among the specified words") + if head is None and words is None: + raise ValueError("Either head or words must be specified") if head is None: - head = mention_words[0] + head = words[0] - mention = CorefMention(head, self) - if mention_words: - mention.words = mention_words - if mention_span: - mention.span = mention_span + mention = CorefMention(words=[head], head=head, entity=self) + if words: + mention.words = words + if span: + mention.span = span self._mentions.sort() return mention @@ -180,8 +395,32 @@ def all_bridging(self): for b in m._bridging: yield b + def __str__(self): + """String representation of the CorefEntity object: Entity.""" + first_mention_head = self._mentions[0].head.form if self._mentions else "" + return f"Entity<{self._eid}: {first_mention_head}>" + + +# BridgingLink +# Especially the relation should be mutable, so we cannot use +# BridgingLink = collections.namedtuple('BridgingLink', 'target relation') +# TODO once dropping support for Python 3.6, we could use +# from dataclasses import dataclass +# @dataclass +# class DataClassCard: +# target: CorefEntity +# relation: str +class BridgingLink: + __slots__ = ['target', 'relation'] -BridgingLink = collections.namedtuple('BridgingLink', 'target relation') + def __init__(self, target, relation=''): + self.target = target + self.relation = '' if relation is None else relation + + def __lt__(self, another): + if self.target == another.target: + return self.relation < another.relation + return self.target < another.target class BridgingLinks(collections.abc.MutableSequence): @@ -189,33 +428,62 @@ class BridgingLinks(collections.abc.MutableSequence): Example usage: >>> bl = BridgingLinks(src_mention) # empty links - >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples - >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset', clusters) # from a string - >>> for cluster, relation in bl: - >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") - >>> print(str(bl)) # c12:Part,c56:Subset - >>> bl('Part').targets == [c12] - >>> bl('Part|Subset').targets == [c12, c56] - >>> bl.append((c89, 'Funct')) + >>> bl = BridgingLinks(src_mention, [(c12, 'part'), (c56, 'subset')]) # from a list of tuples + >>> (bl8, bl9) = BridgingLinks.from_string('c12>> for entity, relation in bl: + >>> print(f"{bl.src_mention} ->{relation}-> {entity.eid}") + >>> print(str(bl)) # c12>> bl('part').targets == [c12] + >>> bl('part|subset').targets == [c12, c56] + >>> bl.append((c57, 'funct')) """ - def __init__(self, src_mention, value=None, clusters=None, strict=True): + + @classmethod + def from_string(cls, string, entities, node, strict=True, tree2docid=None): + """Return a sequence of BridgingLink objects representing a given string serialization. + The bridging links are also added to the mentions (`mention.bridging`) in the supplied `entities`, + so the returned sequence can be usually ignored. 
+ If `tree2docid` parameter is provided (mapping trees to document IDs used as prefixes in eid), + the entity IDs in the provided string are interpreted as "GRP", i.e. as document-wide IDs, + which need to be prefixed by the document IDs, to get corpus-wide unique "eid". + """ + src_str2bl = {} + for link_str in string.split(','): + try: + trg_str, src_str = link_str.split('<') + except ValueError as err: + _error(f"invalid Bridge {link_str} {err} at {node}", strict) + continue + relation = '' + if ':' in src_str: + src_str, relation = src_str.split(':', 1) + if trg_str == src_str: + _error(f"Bridge cannot self-reference the same entity {trg_str} at {node}", strict) + if tree2docid: + src_str = tree2docid[node.root] + src_str + trg_str = tree2docid[node.root] + trg_str + bl = src_str2bl.get(src_str) + if not bl: + bl = entities[src_str].mentions[-1].bridging + src_str2bl[src_str] = bl + if trg_str not in entities: + entities[trg_str] = CorefEntity(trg_str) + bl._data.append(BridgingLink(entities[trg_str], relation)) + return src_str2bl.values() + + def __init__(self, src_mention, value=None, strict=True): self.src_mention = src_mention self._data = [] self.strict = strict if value is not None: - if isinstance(value, str): - if clusters is None: - raise ValueError('BridgingClusters: clusters must be provided if initializing with a string') - try: - self._from_string(value, clusters) - except Exception: - logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n") - raise - elif isinstance(value, collections.abc.Sequence): + if isinstance(value, collections.abc.Sequence): for v in value: - if v[0] is src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + v[0].cluster_id, strict) + if v[0] is src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + v[0].eid, strict) self._data.append(BridgingLink(v[0], v[1])) + else: + raise ValueError(f"Unknown value type: {type(value)}") + self.src_mention._bridging = self super().__init__() def __getitem__(self, key): @@ -226,31 +494,21 @@ def __len__(self): # TODO delete backlinks of old links, dtto for SplitAnte def __setitem__(self, key, new_value): - if new_value[0] is self.src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict) + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) self._data[key] = BridgingLink(new_value[0], new_value[1]) def __delitem__(self, key): del self._data[key] def insert(self, key, new_value): - if new_value[0] is self.src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict) + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) self._data.insert(key, BridgingLink(new_value[0], new_value[1])) def __str__(self): - return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in sorted(self._data)) - - def _from_string(self, string, clusters): - self._data.clear() - for link_str in string.split(','): - target, relation = link_str.split(':') - if target == self.src_mention._cluster._cluster_id: - _error("Bridging cannot self-reference the same cluster: " + target, self.strict) - if target not in clusters: - clusters[target] = CorefCluster(target) - self._data.append(BridgingLink(clusters[target], relation)) - self._data.sort() + # TODO in future link.relation 
should never be None, 0 nor "_", so we could delete the below. + return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.entity.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. @@ -259,139 +517,405 @@ def __call__(self, relations_re=None): """ if relations_re is None: return self - return Links(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) + return BridgingLinks(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) @property def targets(self): - """Return a list of the target clusters (without relations).""" + """Return a list of the target entities (without relations).""" return [link.target for link in self._data] + def _delete_targets_without_mentions(self, warn=True): + for link in self._data: + if not link.target.mentions: + if warn: + logging.warning(f"Entity {link.target.eid} has no mentions, but is referred to in bridging of {self.src_mention.entity.eid}") + self._data.remove(link) -def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): - clusters = head.root.bundle.document.coref_clusters - if not cluster_id: - counter = 1 - while clusters.get('c%d' % counter): - counter += 1 - cluster_id = 'c%d' % counter - elif clusters.get(cluster_id): - raise ValueError("Cluster with a id %s already exists", cluster_id) - cluster = CorefCluster(cluster_id, cluster_type) - cluster.create_mention(head, **kwargs) - clusters[cluster_id] = cluster - return cluster def _error(msg, strict): if strict: raise ValueError(msg) logging.error(msg) + +RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]') +# When converting doc-level GRP IDs to corpus-level eid IDs, +# we need to assign each document a short ID/number (document names are too long). +# These document numbers must be unique even when loading multiple files, +# so we need to store the highest number generated so far here, at the Python module level. +highest_doc_n = 0 + def load_coref_from_misc(doc, strict=True): - clusters = {} + global highest_doc_n + entities = {} + unfinished_mentions = collections.defaultdict(list) + discontinuous_mentions = collections.defaultdict(list) + global_entity = doc.meta.get('global.Entity') + was_global_entity = True + if not global_entity: + was_global_entity = False + global_entity = 'eid-etype-head-other' + doc.meta['global.Entity'] = global_entity + tree2docid = None + if 'GRP' in global_entity: + tree2docid, docid = {}, "" + for bundle in doc: + for tree in bundle: + if tree.newdoc or docid == "": + highest_doc_n += 1 + docid = f"d{highest_doc_n}." 
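A small check of the discontinuous-mention ID pattern introduced here, using the `[subspan/total]` notation described in the module docstring:

```python
import re

RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]')
m = RE_DISCONTINUOUS.match('e12[2/3]')   # subspan 2 of 3 of entity e12
print(m.group(1, 2, 3))                  # ('e12', '2', '3')
```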
+ tree2docid[tree] = docid + doc.meta['_tree2docid'] = tree2docid + elif 'eid' not in global_entity: + raise ValueError("No eid in global.Entity = " + global_entity) + fields = global_entity.split('-') + for node in doc.nodes_and_empty: - index, index_str = 0, "" - cluster_id = node.misc["ClusterId"] - if not cluster_id: - index, index_str = 1, "[1]" - cluster_id = node.misc["ClusterId[1]"] - while cluster_id: - cluster = clusters.get(cluster_id) - if cluster is None: - cluster = CorefCluster(cluster_id) - clusters[cluster_id] = cluster - mention = CorefMention(node, cluster) - if node.misc["MentionSpan" + index_str]: - mention.span = node.misc["MentionSpan" + index_str] + misc_entity = node.misc["Entity"] + if not misc_entity: + continue + + if not was_global_entity: + raise ValueError(f"No global.Entity header found, but Entity= annotations are presents") + + # The Entity attribute may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # chunks = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for chunk in chunks: + opening, closing = (chunk[0] == '(', chunk[-1] == ')') + chunk = chunk.strip('()') + # 1. invalid + if not opening and not closing: + logging.warning(f"Entity {chunk} at {node} has no opening nor closing bracket.") + # 2. closing bracket + elif not opening and closing: + # closing brackets should include just the ID, but GRP needs to be converted to eid + if tree2docid: + # TODO delete this legacy hack once we don't need to load UD GUM v2.8 anymore + if '-' in chunk: + if not strict and global_entity.startswith('entity-GRP'): + chunk = chunk.split('-')[1] + else: + _error("Unexpected closing eid " + chunk, strict) + chunk = tree2docid[node.root] + chunk + + # closing discontinuous mentions + eid, subspan_idx = chunk, None + if chunk not in unfinished_mentions: + m = RE_DISCONTINUOUS.match(chunk) + if not m: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") + eid, subspan_idx, total_subspans = m.group(1, 2, 3) + + try: + mention, head_idx = unfinished_mentions[eid].pop() + except IndexError as err: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") + last_word = mention.words[-1] + if node.root is not last_word.root: + # TODO cross-sentence mentions + if strict: + raise ValueError(f"Cross-sentence mentions not supported yet: {chunk} at {node}") + else: + logging.warning(f"Cross-sentence mentions not supported yet: {chunk} at {node}. 
Deleting.") + entity = mention.entity + mention.words = [] + entity._mentions.remove(mention) + if not entity._mentions: + del entities[entity.eid] + for w in node.root.descendants_and_empty: + if last_word.precedes(w): + mention._words.append(w) + w._mentions.append(mention) + if w is node: + break + if head_idx and (subspan_idx is None or subspan_idx == total_subspans): + try: + mention.head = mention.words[head_idx - 1] + except IndexError as err: + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " + f"closed at {node} with words={mention.words}", strict) + if not strict and head_idx > len(mention.words): + mention.head = mention.words[-1] + if subspan_idx and subspan_idx == total_subspans: + m = discontinuous_mentions[eid].pop() + if m is not mention: + _error(f"Closing mention {mention.entity.eid} at {node}, but it has unfinished nested mentions ({m.words})", 1) + + # 3. opening or single-word else: - mention.words = [node] - cluster_type = node.misc["ClusterType" + index_str] - if cluster_type is not None: - if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: - logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") - cluster.cluster_type = cluster_type - - bridging_str = node.misc["Bridging" + index_str] - if bridging_str: - mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict) - - split_ante_str = node.misc["SplitAnte" + index_str] - if split_ante_str: - split_antes = [] - # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. - # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. - for ante_str in split_ante_str.replace('+', ',').split(','): - if ante_str in clusters: - if ante_str == cluster_id: - _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) - split_antes.append(clusters[ante_str]) + eid, etype, head_idx, other = None, None, None, OtherDualDict() + for name, value in zip(fields, chunk.split('-')): + if name == 'eid': + eid = value + elif name == 'GRP': + eid = tree2docid[node.root] + value + elif name == 'etype' or name == 'entity': # entity is an old name for etype used in UD GUM 2.8 and 2.9 + etype = value + elif name == 'head': + try: + head_idx = int(value) + except ValueError as err: + _error(f"Non-integer {value} as head index in {chunk} in {node}: {err}", strict) + head_idx = 1 + elif name == 'other': + if other: + new_other = OtherDualDict(value) + for k,v in other.values(): + new_other[k] = v + other = new_other + else: + other = OtherDualDict(value) else: - # split cataphora, e.g. "We, that is you and me..." - ante_cl = CorefCluster(ante_str) - clusters[ante_str] = ante_cl - split_antes.append(ante_cl) - cluster.split_ante = sorted(split_antes) - - mention.misc = node.misc["MentionMisc" + index_str] - index += 1 - index_str = f"[{index}]" - cluster_id = node.misc["ClusterId" + index_str] - # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. - # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), - # not by the keys (cluster_id). 
+ other[name] = value + if eid is None: + raise ValueError("No eid in " + chunk) + subspan_idx, total_subspans = None, '0' + if eid[-1] == ']': + m = RE_DISCONTINUOUS.match(eid) + if not m: + _error(f"eid={eid} ending with ], but not valid discontinuous mention ID ", strict) + else: + eid, subspan_idx, total_subspans = m.group(1, 2, 3) + + entity = entities.get(eid) + if entity is None: + if subspan_idx and subspan_idx != '1': + _error(f'Non-first subspan of a discontinuous mention {eid} at {node} does not have any previous mention.', 1) + entity = CorefEntity(eid) + entities[eid] = entity + entity.etype = etype + elif etype and entity.etype and entity.etype != etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype + # CorefEntity could be created first with "Bridge=" without any type + elif etype and entity.etype is None: + entity.etype = etype + + if subspan_idx and subspan_idx != '1': + opened = [pair[0] for pair in unfinished_mentions[eid]] + mention = next(m for m in discontinuous_mentions[eid] if m not in opened) + mention._words.append(node) + if closing and subspan_idx == total_subspans: + m = discontinuous_mentions[eid].pop() + if m is not mention: + _error(f"{node}: closing mention {mention.entity.eid} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) + try: + mention.head = mention._words[head_idx - 1] + except IndexError as err: + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " + f"closed at {node} with words={mention._words}", 1) + else: + mention = CorefMention(words=[node], entity=entity, add_word_backlinks=False) + if other: + mention._other = other + if subspan_idx: + discontinuous_mentions[eid].append(mention) + node._mentions.append(mention) + + if not closing: + unfinished_mentions[eid].append((mention, head_idx)) + + + # Bridge, e.g. Entity=(e12-event|Bridge=e12 (e10) + # (e1(e2 --> (e1(e2(e10) + # e3)(e1(e2 --> e3)(e1(e2(e10) + if not orig_entity or orig_entity[-1] != ')': + firstword.misc['Entity'] += mention_str + ')' + # e4)e3) --> (e10)e4)e3) + elif '(' not in orig_entity: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # (e9)e4)e3) --> (e10)(e9)e4)e3) + elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split(r'(\([^()]+\)?|[^()]+\))', orig_entity)): + firstword.misc['Entity'] += mention_str + ')' + # (e1(e2(e9) --> (e1(e2(e9)(e10) + # e3)(e1(e2(e9)--> e3)(e1(e2(e9)(e10) else: - index, index_str = 1, "[1]" - while(head.misc["ClusterId" + index_str]): - index += 1 - index_str = f"[{index}]" - if index == 1: - index_str = "" - head.misc["ClusterId" + index_str] = cluster.cluster_id - head.misc["MentionSpan" + index_str] = mention.span - head.misc["ClusterType" + index_str] = cluster.cluster_type - if mention._bridging: - head.misc["Bridging" + index_str] = str(mention.bridging) - if cluster.split_ante: - serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) - head.misc["SplitAnte" + index_str] = serialized - if mention.misc: - head.misc["MentionMisc" + index_str] = mention.misc + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # Second, multi-word mentions. Opening brackets should follow closing brackets. 
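The chunk decomposition used by the loader above (and reused by the serialization code around this point) can be checked in isolation on the example from the loader's comment:

```python
import re

misc_entity = "(abstract-7-new-2-coref(abstract-3-giv:act-1-coref)"
chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', misc_entity) if x]
print(chunks)   # ['(abstract-7-new-2-coref', '(abstract-3-giv:act-1-coref)']
```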
+ else: + firstword.misc['Entity'] += mention_str + eid = entity.eid + if tree2docid and 'GRP' in fields: + eid = re.sub(r'^d\d+\.', '', eid) + mention.words[-1].misc['Entity'] = eid + ')' + mention.words[-1].misc['Entity'] + # Bridge=e1 lo else f"{lo}") return ','.join(ranges) + + +# TODO fix code duplication with udapi.core.dualdict after making sure benchmarks are not slower +class OtherDualDict(collections.abc.MutableMapping): + """OtherDualDict class serves as dict with lazily synchronized string representation. + + >>> ddict = OtherDualDict('anacata:anaphoric,antetype:entity,nptype:np') + >>> ddict['mention'] = 'np' + >>> str(ddict) + 'anacata:anaphoric,antetype:entity,mention:np,nptype:np' + >>> ddict['NonExistent'] + '' + + This class provides access to both + * a structured (dict-based, deserialized) representation, + e.g. {'anacata': 'anaphoric', 'antetype': 'entity'}, and + * a string (serialized) representation of the mapping, e.g. `anacata:anaphoric,antetype:entity`. + There is a clever mechanism that makes sure that users can read and write + both of the representations which are always kept synchronized. + Moreover, the synchronization is lazy, so the serialization and deserialization + is done only when needed. This speeds up scenarios where access to dict is not needed. + + A value can be deleted with any of the following three ways: + >>> del ddict['nptype'] + >>> ddict['nptype'] = None + >>> ddict['nptype'] = '' + and it works even if the value was already missing. + """ + __slots__ = ['_string', '_dict'] + + def __init__(self, value=None, **kwargs): + if value is not None and kwargs: + raise ValueError('If value is specified, no other kwarg is allowed ' + str(kwargs)) + self._dict = dict(**kwargs) + self._string = None + if value is not None: + self.set_mapping(value) + + def __str__(self): + if self._string is None: + serialized = [] + for name, value in sorted(self._dict.items(), key=lambda s: s[0].lower()): + if value is True: + serialized.append(name) + else: + serialized.append(f"{name}:{value}") + self._string = ','.join(serialized) if serialized else '' + return self._string + + def _deserialize_if_empty(self): + if not self._dict and self._string is not None and self._string != '': + for raw_feature in self._string.split(','): + namevalue = raw_feature.split(':', 1) + if len(namevalue) == 2: + name, value = namevalue + else: + name, value = namevalue[0], True + self._dict[name] = value + + def __getitem__(self, key): + self._deserialize_if_empty() + return self._dict.get(key, '') + + def __setitem__(self, key, value): + self._deserialize_if_empty() + self._string = None + if value is None or value == '': + self.__delitem__(key) + else: + value = value.replace(',', '%2C') # TODO report a warning? Escape also '|' and '-'? + self._dict[key] = value + + def __delitem__(self, key): + self._deserialize_if_empty() + try: + del self._dict[key] + self._string = None + except KeyError: + pass + + def __iter__(self): + self._deserialize_if_empty() + return self._dict.__iter__() + + def __len__(self): + self._deserialize_if_empty() + return len(self._dict) + + def __contains__(self, key): + self._deserialize_if_empty() + return self._dict.__contains__(key) + + def clear(self): + self._string = '_' + self._dict.clear() + + def copy(self): + """Return a deep copy of this instance.""" + return copy.deepcopy(self) + + def set_mapping(self, value): + """Set the mapping from a dict or string. + + If the `value` is None, it is converted to storing an empty string. 
+ If the `value` is a string, it is stored as is. + If the `value` is a dict (or any instance of `collections.abc.Mapping`), + its copy is stored. + Other types of `value` raise an `ValueError` exception. + """ + if value is None: + self.clear() + elif isinstance(value, str): + self._dict.clear() + self._string = value + elif isinstance(value, collections.abc.Mapping): + self._string = None + self._dict = dict(value) + else: + raise ValueError("Unsupported value type " + str(value)) diff --git a/udapi/core/document.py b/udapi/core/document.py index f02f831e..5f2bdf0b 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -2,6 +2,7 @@ import io import contextlib +import logging import udapi.core.coref from udapi.core.bundle import Bundle from udapi.block.read.conllu import Conllu as ConlluReader @@ -22,18 +23,18 @@ def __init__(self, filename=None, **kwargs): No pre-processing is applied, so when loading the document from a *.txt file, `Document("a.txt").nodes` will be empty and you need to run tokenization first. You can pass additional parameters for `udapi.block.read.sentences` - (`ignore_empty_lines` and `rstrip`). + (`ignore_empty_lines`, `newdoc_if_empty_line` and `rstrip`). """ self.bundles = [] self._highest_bundle_id = 0 self.meta = {} self.json = {} - self._coref_clusters = None + self._eid_to_entity = None if filename is not None: if filename.endswith(".conllu"): self.load_conllu(filename, **kwargs) elif filename.endswith(".txt"): - reader = SentencesReader(files=filename, **kwargs) + reader = SentencesReader(files=[filename], **kwargs) reader.apply_on_document(self) else: raise ValueError("Only *.conllu and *.txt are supported. Provided: " + filename) @@ -44,6 +45,9 @@ def __iter__(self): def __getitem__(self, key): return self.bundles[key] + def __len__(self): + return len(self.bundles) + def __str__(self): """Pretty print the whole document using write.TextModeTrees.""" fh = io.StringIO() @@ -61,11 +65,11 @@ def create_bundle(self): def load_conllu(self, filename=None, **kwargs): """Load a document from a conllu-formatted file.""" - ConlluReader(files=filename, **kwargs).process_document(self) + ConlluReader(files=[filename], **kwargs).process_document(self) def store_conllu(self, filename): """Store a document into a conllu-formatted file.""" - ConlluWriter(files=filename).apply_on_document(self) + ConlluWriter(files=[filename]).apply_on_document(self) def from_conllu_string(self, string): """Load a document from a conllu-formatted string.""" @@ -91,7 +95,9 @@ def nodes(self): """An iterator over all nodes (excluding empty nodes) in the document.""" for bundle in self: for tree in bundle: - for node in tree._descendants: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process_tree(). + for node in tree.descendants: yield node @property @@ -107,17 +113,57 @@ def draw(self, **kwargs): TextModeTrees(**kwargs).run(self) def _load_coref(self): - """De-serialize coreference-related objects (CorefMention, CorefCluster). + """De-serialize coreference-related objects (CorefMention, CorefEntity). This internal method will be called automatically whenever any coref-related method is called. It iterates through all nodes in the document and creates the objects based on the info in MISC - (stored in attributes ClusterId, MentionSpan, ClusterType, Split, Bridging). + (stored in attributes Entity, SplitAnte, Bridge). 
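A hedged sketch of the document-level API touched in this part of the diff, using the new `__len__` and the `coref_entities` accessor added just below (the file name is hypothetical; coreference is deserialized lazily on first access):

```python
import udapi

doc = udapi.Document('sample.conllu')
print(len(doc))                    # number of bundles, via the new __len__
for entity in doc.coref_entities:
    for mention in entity.mentions:
        print(entity.eid, mention.head.form)
```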
""" - if self._coref_clusters is None: + if self._eid_to_entity is None: udapi.core.coref.load_coref_from_misc(self) + @property + def eid_to_entity(self): + """A dict mapping each eid (entity ID) to a CorefEntity object.""" + self._load_coref() + return self._eid_to_entity + @property def coref_clusters(self): - """A dict mapping ClusterId to a CorefCluster object.""" + """DEPRECATED: A dict mapping eid to a CorefEntity object. + + Substitute `doc.coref_clusters.values()` and `list(doc.coref_clusters.values())` + with `doc.coref_entities`. + Otherwise, substitute `doc.coref_clusters` with `doc.eid_to_entity`. + """ + logging.warning("coref_clusters is deprecated, use coref_entities or eid_to_entity instead.") + return self.eid_to_entity + + @property + def coref_entities(self): + """A list of all CorefEntity objects in the document.""" + self._load_coref() + return list(self._eid_to_entity.values()) + + @property + def coref_mentions(self): + """A sorted list of all CorefMention objects in the document.""" + self._load_coref() + all_mentions = [] + for entity in self._eid_to_entity.values(): + all_mentions.extend(entity.mentions) + all_mentions.sort() + return all_mentions + + def create_coref_entity(self, eid=None, etype=None): self._load_coref() - return self._coref_clusters + if not eid: + counter = 1 + while self._eid_to_entity.get(f'e{counter}'): + counter += 1 + eid = f'e{counter}' + elif self._eid_to_entity.get(eid): + raise ValueError("Entity with eid=%s already exists", eid) + entity = udapi.core.coref.CorefEntity(eid, etype) + self._eid_to_entity[eid] = entity + return entity diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index a79c0610..ba0129ed 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -45,7 +45,7 @@ def __str__(self): if value is True: serialized.append(name) else: - serialized.append('%s=%s' % (name, value)) + serialized.append(f"{name}={value}") self._string = '|'.join(serialized) if serialized else '_' return self._string diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) @@ -73,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 684adfaf..3cc95cac 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -47,6 +47,20 @@ def address(self): """Full (document-wide) id of the multi-word token.""" return self.root.address + '#' + self.ord_range + @staticmethod + def is_mwt(): + """Is this a multi-word token? + + Returns always True. + False is returned only by instances of the Node class. 
+ """ + return True + + @property + def no_space_after(self): + """Boolean property as a shortcut for `node.misc["SpaceAfter"] == "No"`.""" + return self.misc["SpaceAfter"] == "No" + # TODO: node.remove() should check if the node is not part of any MWT # TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported # TODO: Make mwt._words private and provide a setter diff --git a/udapi/core/node.py b/udapi/core/node.py index 5225724e..3897c3a6 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. @@ -316,6 +324,8 @@ def parent(self, new_parent): # Check for None new_parent and cycles. if new_parent is None: raise ValueError(f'Cannot set None as parent: {self}') + if new_parent.is_empty(): + raise ValueError(f'Cannot set EmptyNode as parent in basic dependencies: {self}') if self is new_parent: raise CycleError('Cannot set a node as its own parent (cycle are forbidden): %s', self) if self._children and new_parent.is_descendant_of(self): @@ -357,10 +367,25 @@ def children(self): nodes2 = [n for n in node.children if n.ord > node.ord] nodes3 = [n for n in node.children if n.ord < node.ord] nodes4 = [n for n in node.children if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. """ return ListOfNodes(self._children, origin=self) + @property + def siblings(self): + """Return a list of dependency sibling nodes. + + When used as a property, `node.siblings` is just a shortcut for: + [n for n in node.parent.children if n!=node] + However, it is especially helpful when used as a method, + so e.g. `node.siblings(preceding_only=True)` stands for + [n for n in node.parent.children if n.ord < node.ord] + which is something else than + node.parent.children(preceding_only=True). + See the documentation of ListOfNodes for details. + """ + return ListOfNodes([n for n in self._parent._children if n!=self], origin=self) + @property def descendants(self): """Return a list of all descendants of the current node. @@ -380,7 +405,7 @@ def descendants(self): nodes2 = [n for n in node.descendants if n.ord > node.ord] nodes3 = [n for n in node.descendants if n.ord < node.ord] nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. """ # The following code is equivalent to # ListOfNodes(sorted(self.unordered_descendants()), origin=self) @@ -481,7 +506,7 @@ def is_empty(): return False def remove(self, children=None): - """Delete this node and all its descendants. + """Delete this node (and all its descendants unlsess specified otherwise). Args: children: a string specifying what to do if the node has any children. @@ -491,6 +516,8 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). """ self._parent._children.remove(self) + + # If there are any children, do the action specified in the "children" parameter. 
if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -511,18 +538,51 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + last_ord = 0 + for empty in self._root.empty_nodes: + if empty._ord > self._ord: + new_ord = round(empty._ord - 1, 1) + if new_ord <= last_ord: + new_ord = round(last_ord + 0.1, 1) + empty.ord = new_ord + last_ord = empty._ord else: + # Remember the position of empty nodes, so we can reorder them as well. + empty_follows = None + if self._root.empty_nodes: + will_be_removed = self if children and children.startswith('rehang') else self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.is_empty(): + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". + if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty._ord + (empty._ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" all_nodes = self._root._descendants + empty_nodes = self._root.empty_nodes # Moving a single node can be faster than nodes_to_move = [self] if without_children or not self._children: @@ -533,14 +593,25 @@ def _shift_before_ord(self, reference_ord, without_children=False): all_nodes[i_ord - 1]._ord = i_ord all_nodes[reference_ord - 2] = self self._ord = reference_ord - 1 + for en in empty_nodes: + if en._ord > my_ord and en._ord < reference_ord: + en._ord -= 1 elif reference_ord < my_ord: for i_ord in range(my_ord, reference_ord, -1): all_nodes[i_ord - 1] = all_nodes[i_ord - 2] all_nodes[i_ord - 1]._ord = i_ord all_nodes[reference_ord - 1] = self self._ord = reference_ord + for en in empty_nodes: + # Empty nodes before the first overt token (ID=0.X) will be never moved this way. + # We cannot know whether the caller wanted to place the shifted node before or after them. + if en._ord < my_ord and en._ord > reference_ord: + en._ord += 1 + self._parent._children.sort() return + #TODO: Updating ords of empty nodes is implemented only for the simple case above, + # but it has to be implemented also for the complex case below! 
nodes_to_move = self.descendants(add_self=True) first_ord, last_ord = nodes_to_move[0]._ord, nodes_to_move[-1]._ord @@ -564,6 +635,7 @@ def _shift_before_ord(self, reference_ord, without_children=False): for node in nodes_to_move: all_nodes[trg_ord - 1], node._ord = node, trg_ord trg_ord += 1 + self._parent._children.sort() return # First, move a node from position src_ord to position trg_ord RIGHT-ward. @@ -597,6 +669,7 @@ def _shift_before_ord(self, reference_ord, without_children=False): for node in nodes_to_move: all_nodes[trg_ord - 1], node._ord = node, trg_ord trg_ord += 1 + self._parent._children.sort() def shift_after_node(self, reference_node, without_children=False, skip_if_descendant=False): """Shift this node after the reference_node.""" @@ -607,7 +680,9 @@ def shift_after_node(self, reference_node, without_children=False, skip_if_desce self._shift_before_ord(reference_node._ord + 1, without_children=without_children) def shift_before_node(self, reference_node, without_children=False, skip_if_descendant=False): - """Shift this node after the reference_node.""" + """Shift this node before the reference_node.""" + if reference_node.is_root(): + raise ValueError(f'Cannot shift a node before the root ({reference_node})') if not without_children and reference_node.is_descendant_of(self): if skip_if_descendant: return @@ -636,6 +711,8 @@ def shift_before_subtree(self, reference_node, without_children=0, skip_if_desce Args: without_children: shift just this node without its subtree? """ + if reference_node.is_root(): + raise ValueError(f'Cannot shift a node before the root ({reference_node})') if not without_children and reference_node.is_descendant_of(self): if skip_if_descendant: return @@ -677,6 +754,8 @@ def precedes(self, node): return self._ord < node._ord if self._root._zone != node._root._zone: raise ValueError(f"Cannot compare word order across zones: {self} {node}") + if self._root._bundle._document is not node._root._bundle._document: + raise ValueError(f"Cannot compare word order across documents: {self} {node}") return self._root._bundle.number < node._root._bundle.number def is_leaf(self): @@ -685,21 +764,21 @@ def is_leaf(self): def _get_attr(self, name): # pylint: disable=too-many-return-statements if name == 'dir': - if self._parent.is_root(): + if not self._parent or self._parent.is_root(): return 'root' return 'left' if self.precedes(self._parent) else 'right' if name == 'edge': - if self._parent.is_root(): + if not self._parent or self._parent.is_root(): return 0 return self._ord - self._parent._ord if name == 'children': return len(self._children) if name == 'siblings': - return len(self._parent._children) - 1 + return 0 if not self._parent else len(self._parent._children) - 1 if name == 'depth': value = 0 tmp = self - while not tmp.is_root(): + while tmp and not tmp.is_root(): tmp = tmp._parent value += 1 return value @@ -831,6 +910,18 @@ def multiword_token(self): """ return self._mwt + @property + def words(self): + """Return one-item list with this node. + + This property is there for compatibility with udapi.core.mwt.MWT.words. + So that it is possible to use code such as: + for token in root.token_descendants: + words = token.words + ... + """ + return [self] + def is_nonprojective(self): """Is the node attached to its parent non-projectively? 
@@ -902,13 +993,25 @@ def coref_mentions(self): return self._mentions @property - def coref_clusters(self): + def coref_entities(self): self._root.bundle.document._load_coref() - return [m.cluster for m in self._mentions if m.cluster is not None] + return [m.entity for m in self._mentions if m.entity is not None] - def create_coref_cluster(self, **kwargs): - return udapi.core.coref.create_coref_cluster(head=self, **kwargs) + # TODO: is this method useful? + def create_coref_entity(self, eid=None, etype=None, **kwargs): + doc = self._root.bundle.document + entity = doc.create_coref_entity(eid, etype) + entity.create_mention(head=self, **kwargs) + return entity + + @staticmethod + def is_mwt(): + """Is this a multi-word token? + Returns False for all Node instances. + True is returned only by instances of the MWT class. + """ + return False class CycleError(Exception): '''A cycle in the dependency tree detected (or would be created).''' @@ -960,6 +1063,19 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): """Attempts at changing the word order of EmptyNode result in NotImplemented exception.""" raise NotImplemented('Empty nodes cannot be re-order using shift* methods yet.') + def remove(self): + """Delete this empty node.""" + to_reorder = [e for e in self._root.empty_nodes if e._ord > self._ord and e._ord < self.ord+1] + for empty in to_reorder: + empty._ord = round(empty._ord - 0.1, 1) + try: + self._root.empty_nodes.remove(self) + except ValueError: + return # self may be an already deleted node e.g. if n.remove() called twice + for n in self._root.empty_nodes + self._root._descendants: + if n._deps: + n._deps = {(deprel, parent) for deprel, parent in n._deps if parent != self} + @functools.total_ordering class OrdTuple: """Class for the rare case of 9+ consecutive empty nodes, i.e. ords x.10, x.11 etc. diff --git a/udapi/core/resource.py b/udapi/core/resource.py index 9e5923f1..da2ba561 100644 --- a/udapi/core/resource.py +++ b/udapi/core/resource.py @@ -2,6 +2,7 @@ import logging import urllib.request import os +from os.path import expanduser BASEURL = 'http://ufallab.ms.mff.cuni.cz/tectomt/share/data/' @@ -11,8 +12,10 @@ def require_file(path): if not os.path.isfile(path): raise IOError(path + " does not exist") return os.path.abspath(path) - udapi_data = os.environ.get('UDAPI_DATA', os.environ.get('HOME')) - full_path = udapi_data + '/' + path + udapi_data = os.environ.get('UDAPI_DATA', expanduser('~')) + if udapi_data is None: + raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={expanduser('~')}") + full_path = os.path.join(udapi_data, path) if not os.path.isfile(full_path): logging.info('Downloading %s to %s', BASEURL + path, full_path) os.makedirs(os.path.dirname(full_path), exist_ok=True) diff --git a/udapi/core/root.py b/udapi/core/root.py index 3fbe5fca..d5e0f4a8 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -71,6 +71,10 @@ def address(self): """ return self.sent_id + @property + def document(self): + return self._bundle._document + @property def bundle(self): """Return the bundle which this tree belongs to.""" @@ -91,6 +95,13 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + slashzone = '/' + zone if zone else '' + if self._bundle is not None: + self._sent_id = self._bundle.address() + slashzone + elif self._sent_id: + self._sent_id = self._sent_id.split('/', 1)[0] + slashzone + else: + self._sent_id = '?' 
+ slashzone @property def parent(self): @@ -133,7 +144,7 @@ def remove(self, children=None): The default (None) is to delete them (and all their descendants). `warn` means to issue a warning. """ - if children is not None and self.children: + if children is not None and self._children: logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) self.bundle.trees = [root for root in self.bundle.trees if root != self] @@ -165,8 +176,18 @@ def create_multiword_token(self, words=None, form=None, misc=None): form: string representing the surface form of the new MWT misc: misc attribute of the new MWT """ + # Nested or overlapping MWTs are not allowed in CoNLL-U, + # so first remove all previous MWTs containing any of words. + for w in words: + if w.multiword_token: + w.multiword_token.remove() + # Now, create the new MWT. mwt = MWT(words, form, misc, root=self) self._mwts.append(mwt) + if words[-1].misc["SpaceAfter"] == "No": + mwt.misc["SpaceAfter"] = "No" + for word in words: + word.misc["SpaceAfter"] = "" return mwt @property @@ -257,10 +278,10 @@ def steal_nodes(self, nodes): node.ord = new_ord node._root = self if not whole_tree: - for child in [n for n in node.children if n not in nodes]: + for child in [n for n in node._children if n not in nodes]: child._parent = old_root - old_root._children = sorted(old_root.children + [child]) - node._children = [n for n in node.children if n in nodes] + old_root._children = sorted(old_root._children + [child]) + node._children = [n for n in node._children if n in nodes] if node.parent == old_root or (not whole_tree and node.parent not in nodes): node.parent._children = [n for n in node.parent._children if n != node] node._parent = self @@ -279,3 +300,39 @@ def steal_nodes(self, nodes): self.create_multiword_token(words=words, form=mwt.form, misc=mwt.misc) self._descendants += nodes # pylint: enable=protected-access + + def flatten(self, deprel='root'): + """Flatten the tree (i.e. attach all nodes to the root) and reset all deprels. + + This is equivalent to + for node in root.descendants: + node.parent = root + node.deprel = 'root' + but it is faster. 
+ """ + self._children = self._descendants[:] + for node in self._children: + node._parent = self + node._children.clear() + + @property + def prev_tree(self): + """Return the previous tree (root) in the document (from the same zone).""" + doc = self._bundle._document + num = self._bundle.number + if len(doc.bundles) <= num - 1 or doc.bundles[num - 1] is not self._bundle: + num = doc.bundles.index(self._bundle) + 1 + if num == 1: + return None + return doc.bundles[num - 2].get_tree(zone=self._zone) + + @property + def next_tree(self): + """Return the next tree (root) in the document (from the same zone).""" + doc = self._bundle._document + num = self._bundle.number + if len(doc.bundles) <= num - 1 or doc.bundles[num - 1] is not self._bundle: + num = doc.bundles.index(self._bundle) + 1 + if len(doc.bundles) <= num: + return None + return doc.bundles[num].get_tree(zone=self._zone) diff --git a/udapi/core/run.py b/udapi/core/run.py index 0a08504c..8ac63e1e 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,30 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + try: # ignore modules with compilation errors + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bnames = [c for c in dir(module) if c.lower() == sname] + if bnames: + blocks.append(f"{pname}.{bnames[0]}") + except: + pass + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. @@ -92,8 +116,17 @@ def _import_blocks(block_names, block_args): command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + package_blocks = _blocks_in_a_package(package_name) + if not package_blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available blocks in {package_name} are:\n" + + "\n".join(package_blocks)) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. @@ -101,7 +134,8 @@ def _import_blocks(block_names, block_args): command = "b%s(**kwargs)" % block_id logging.debug("Trying to evaluate this: %s", command) new_block_instance = eval(command) # pylint: disable=eval-used - blocks.append(new_block_instance) + args = ' '.join(f"{k}={v}" for k,v in kwargs.items()) + blocks.append((block_name, new_block_instance, args)) return blocks @@ -132,12 +166,15 @@ def execute(self): # Import blocks (classes) and construct block instances. blocks = _import_blocks(block_names, block_args) + return self.run_blocks(blocks) + + def run_blocks(self, blocks): # Initialize blocks (process_start).
- for block in blocks: + for _, block, _ in blocks: block.process_start() readers = [] - for block in blocks: + for _, block, _ in blocks: try: block.finished # pylint: disable=pointless-statement readers.append(block) @@ -147,15 +184,15 @@ def execute(self): logging.info('No reader specified, using read.Conllu') conllu_reader = Conllu() readers = [conllu_reader] - blocks = readers + blocks + blocks = [('read.Conllu', conllu_reader, {})] + blocks # Apply blocks on the data. finished = False while not finished: document = Document() logging.info(" ---- ROUND ----") - for block in blocks: - logging.info("Executing block " + block.__class__.__name__) + for bname, block, args in blocks: + logging.info(f"Executing block {bname} {args}") block.apply_on_document(document) finished = True @@ -164,9 +201,12 @@ def execute(self): finished = finished and reader.finished # 6. close blocks (process_end) - for block in blocks: + for _, block, _ in blocks: block.process_end() + # Some users may use the block instances (e.g. to retrieve some variables). + return blocks + # TODO: better implementation, included Scen def scenario_string(self): """Return the scenario string.""" @@ -176,4 +216,4 @@ def scenario_string(self): def create_block(block, **kwargs): """A factory function for creating new block instances (handy for IPython).""" blocks = _import_blocks([block], [kwargs]) - return blocks[0] + return blocks[0][1] diff --git a/udapi/core/tests/__init__.py b/udapi/core/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/core/tests/data/fr-democrat-dev-sample.conllu b/udapi/core/tests/data/fr-democrat-dev-sample.conllu new file mode 100644 index 00000000..b3a85f80 --- /dev/null +++ b/udapi/core/tests/data/fr-democrat-dev-sample.conllu @@ -0,0 +1,60 @@ +# newdoc id = ungroupped-estrepublicain-2-066 +# global.Entity = eid-etype-head-other +# newpar id = ungroupped-estrepublicain-2-066-p0 +# sent_id = ungroupped-estrepublicain-2-066-p0-s1 +# text = Les allocations de décembre arrivent ! +1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ Entity=(e36772--2 +2 allocations allocation NOUN _ Gender=Fem|Number=Plur 5 nsubj _ _ +3 de de ADP _ _ 4 case _ _ +4 décembre décembre NOUN _ Gender=Masc|Number=Sing 2 nmod _ Entity=e36772) +5 arrivent arriver VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +6 ! ! PUNCT _ _ 5 punct _ _ + +# newpar id = ungroupped-estrepublicain-2-066-p1 +# sent_id = ungroupped-estrepublicain-2-066-p1-s1 +# text = La Caisse d' Allocations familiales du Territoire de Belfort informe ses allocataires que le montant des prestations sera disponible sur les comptes bancaires ou postaux à partir du 8 janvier . 
+1 La le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ Entity=(e36773--2 +2 Caisse caisse NOUN _ Gender=Fem|Number=Sing 11 nsubj _ Entity=(e36774-organization-1 +3 d' de ADP _ _ 4 case _ _ +4 Allocations Allocations NOUN _ Gender=Fem|Number=Plur 2 nmod _ _ +5 familiales familial ADJ _ Gender=Fem|Number=Plur 4 amod _ _ +6-7 du _ _ _ _ _ _ _ _ +6 de de ADP _ _ 8 case _ _ +7 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 8 det _ Entity=(e36775--2 +8 Territoire territoire NOUN _ Gender=Masc|Number=Sing 2 nmod _ _ +9 de de ADP _ _ 10 case _ _ +10 Belfort Belfort PROPN _ _ 8 nmod _ Entity=e36775)e36774)e36773) +11 informe informer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +12 ses son DET _ Gender=Masc|Number=Plur|Poss=Yes|PronType=Prs 13 det _ Entity=(e36776--2(e36773--1) +13 allocataires allocataire NOUN _ Gender=Masc|Number=Plur 11 obj _ Entity=e36776) +14 que que PRON _ PronType=Rel 21 mark _ _ +15 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 16 det _ Entity=(e36777--2 +16 montant montant NOUN _ Gender=Masc|Number=Sing 21 nsubj _ _ +17-18 des _ _ _ _ _ _ _ _ +17 de de ADP _ _ 19 case _ _ +18 les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 19 det _ Entity=(e36778--2 +19 prestations prestation NOUN _ Gender=Fem|Number=Plur 16 nmod _ Entity=e36778)e36777) +20 sera être AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin 21 cop _ _ +21 disponible disponible ADJ _ Gender=Fem|Number=Sing 11 advcl _ _ +22 sur sur ADP _ _ 24 case _ _ +23 les le DET _ Definite=Def|Gender=Masc|Number=Plur|PronType=Art 24 det _ Entity=(e36779--2 +24 comptes compte NOUN _ Gender=Masc|Number=Plur 21 obl _ _ +25 bancaires bancaire ADJ _ Gender=Masc|Number=Plur 24 amod _ _ +26 ou ou CCONJ _ _ 27 cc _ _ +27 postaux postal ADJ _ Gender=Masc|Number=Plur 25 conj _ Entity=e36779) +28 à à ADP _ _ 33 case _ _ +29 partir partir VERB _ VerbForm=Inf 28 fixed _ _ +30-31 du _ _ _ _ _ _ _ _ +30 de de ADP _ _ 28 fixed _ _ +31 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 33 det _ Entity=(e36780--3 +32 8 8 NUM _ _ 33 nummod _ _ +33 janvier janvier NOUN _ Gender=Masc|Number=Sing 21 obl _ Entity=e36780) +34 . . 
PUNCT _ _ 11 punct _ _ + +# newdoc id = ungroupped-estrepublicain-2-005 +# global.Entity = eid-etype-head-other +# newpar id = ungroupped-estrepublicain-2-005-p0 +# sent_id = ungroupped-estrepublicain-2-005-p0-s1 +# text = Vitry-le-François +1 Vitry-le-François Vitry-le-François PROPN _ _ 0 root _ Entity=(e36781-place-1) + diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py new file mode 100755 index 00000000..8952d6d8 --- /dev/null +++ b/udapi/core/tests/test_coref.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import os +import unittest +import udapi +from udapi.block.read.conllu import Conllu as ConlluReader + + +class TestCoref(unittest.TestCase): + + def test_load(self): + data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') + reader = ConlluReader(files=data_filename, split_docs=True) + docs = reader.read_documents() + self.assertEqual(len(docs), 2) + docs[-1].draw() + coref_entities = docs[-1].coref_entities + self.assertEqual(len(coref_entities), 1) + self.assertEqual(coref_entities[0].eid, 'e36781') + node = next(docs[-1].nodes) + self.assertEqual(len(node.coref_entities), 1) + self.assertEqual(len(node.coref_mentions), 1) + self.assertEqual(node.coref_entities[0], coref_entities[0]) + self.assertEqual(docs[-1].meta["loaded_from"], data_filename) + + def test_edits(self): + data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') + doc = udapi.Document(data_filename) + first_node = next(doc.nodes) + second_node = first_node.next_node + new_entity = doc.create_coref_entity(etype='person') + self.assertEqual(new_entity.etype, 'person') + self.assertEqual(len(new_entity.mentions), 0) + m1 = new_entity.create_mention(words=[first_node]) # head will be automatically set to words[0] + self.assertEqual(len(new_entity.mentions), 1) + self.assertEqual(m1, new_entity.mentions[0]) + self.assertEqual(m1.entity, new_entity) + self.assertEqual(m1.head, first_node) + self.assertEqual(m1.words, [first_node]) + self.assertEqual(m1.span, '1') + m1.words = [second_node, first_node, first_node] # intentional duplicates and wrong order + self.assertEqual(m1.words, [first_node, second_node]) + self.assertEqual(m1.span, '1-2') + m1.head = second_node + self.assertEqual(m1.head, second_node) + m2 = new_entity.create_mention(head=second_node, span='1-3') # mention.words will be filled according to the span + self.assertEqual(len(new_entity.mentions), 2) + self.assertEqual(new_entity.mentions[0], m2) # 1-3 should go before 1-2 + self.assertEqual(new_entity.mentions[1], m1) + self.assertTrue(m2 < m1) + self.assertEqual(m2.words, [first_node, second_node, second_node.next_node]) + entity2 = doc.create_coref_entity() + m1.entity = entity2 + self.assertEqual(m1.entity.eid, entity2.eid) + m2.entity = entity2 + self.assertEqual(m2.entity.eid, entity2.eid) + + +if __name__ == "__main__": + unittest.main() diff --git a/udapi/core/tests/test_document.py b/udapi/core/tests/test_document.py index 66363ca9..28283dda 100755 --- a/udapi/core/tests/test_document.py +++ b/udapi/core/tests/test_document.py @@ -9,12 +9,15 @@ class TestDocument(unittest.TestCase): def test_init(self): doc = Document() - def test_iterator(self): + def test_ids(self): doc = Document() - doc.bundles = ['a', 'b', 'c'] - for bundle in doc: - print(bundle) - + bundle1 = doc.create_bundle() + bundle2 = doc.create_bundle() + self.assertEqual(bundle1.address(), "1") + self.assertEqual(bundle2.address(), "2") + self.assertEqual([b.bundle_id for 
b in doc], ["1", "2"]) + tree1 = bundle1.create_tree() + self.assertEqual(tree1.address(), "1") if __name__ == "__main__": unittest.main() diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index f38ca585..83348c67 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -36,6 +36,8 @@ def test_topology(self): self.assertEqual(len(nodes[1].children), 3) self.assertEqual(len(nodes[1].children(add_self=True)), 4) self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3) + self.assertEqual(nodes[2].siblings, [nodes[0], nodes[3]]) + self.assertEqual(nodes[2].siblings(following_only=True), [nodes[3]]) self.assertEqual(nodes[0].next_node, nodes[1]) self.assertEqual(nodes[2].prev_node, nodes[1]) @@ -59,6 +61,9 @@ def test_topology(self): nodes[0].shift_after_node(nodes[1]) self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6]) self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6]) + self.assertEqual([node.ord for node in nodes[1].children], [2, 3, 4]) + nodes[3].shift_before_node(nodes[2]) + self.assertEqual([node.ord for node in nodes[1].children], [2, 3, 6]) def test_draw(self): """Test the draw() method, which uses udapi.block.write.textmodetrees.""" @@ -117,7 +122,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -143,6 +148,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting and setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file.
@@ -186,5 +214,36 @@ def test_deps_setter(self): self.assertEqual(nodes[0].raw_deps, '2:test') + def test_empty_nodes(self): + """Test creation of empty nodes and how their ord is changed when removing nodes.""" + root = Root() + for i in range(3): + root.create_child(form=f'node{i+1}') + + n1, n2, n3 = root.descendants() + n3.parent = n2 + e1 = n1.create_empty_child('dep', after=False, form='e1') + e2 = n1.create_empty_child('dep', after=False, form='e2') + e3 = n1.create_empty_child('dep', after=True, form='e3') + e4 = n1.create_empty_child('dep', after=True, form='e4') + e5 = n2.create_empty_child('dep', after=False, form='e5') + e6 = n1.create_empty_child('dep', after=True, form='e6') + + self.assertEqual(root.empty_nodes, [e1, e2, e3, e4, e5, e6]) + self.assertEqual(root.descendants_and_empty, [e1, e2, n1, e3, e4, e5, e6, n2, n3]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 1, 1.1, 1.2, 1.3, 1.4, 2, 3]) + e5.remove() + self.assertEqual(root.descendants_and_empty, [e1, e2, n1, e3, e4, e6, n2, n3]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 1, 1.1, 1.2, 1.3, 2, 3]) + n1.remove() + self.assertEqual(root.descendants_and_empty, [e1, e2, e3, e4, e6, n2, n3]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]) + e7 = n3.create_empty_child('dep', after=True, form='e7') + self.assertEqual(root.descendants_and_empty, [e1, e2, e3, e4, e6, n2, n3, e7]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 2.1]) + n2.remove() + self.assertEqual(root.descendants_and_empty, [e1, e2, e3, e4, e6, e7]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + if __name__ == "__main__": unittest.main() diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index 18f6b2ca..83e289a2 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -22,8 +22,10 @@ def __init__(self, model): self.conllu_reader = ConlluReader() self.tokenizer = self.tool.newTokenizer(Model.DEFAULT) - def tag_parse_tree(self, root): + def tag_parse_tree(self, root, tag=True, parse=True): """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + if not tag and not parse: + raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.') descendants = root.descendants if not descendants: return @@ -34,11 +36,15 @@ def tag_parse_tree(self, root): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = self.conllu_reader.read_tree() - nodes = [root] + descendants + attrs = 'upos xpos lemma feats'.split() if tag else [] + if parse: + attrs.append('deprel') + root.flatten() for parsed_node in parsed_root.descendants: - node = nodes[parsed_node.ord] - node.parent = nodes[parsed_node.parent.ord] - for attr in 'upos xpos lemma feats deprel'.split(): + node = descendants[parsed_node.ord - 1] + if parse: + node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root + for attr in attrs: setattr(node, attr, getattr(parsed_node, attr)) # TODO: benchmark which solution is the fastest one. E.g. 
we could also do @@ -47,11 +53,13 @@ def tag_parse_tree(self, root): # pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants - def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): + def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, ranges=False): """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. If resegment=True, the returned list of Udapi trees may contain multiple trees. """ + if ranges: + raise ValueError('ranges=True is implemented only in the REST API version (add "online=1" to the udpipe block)') if root.children: raise ValueError('Tree already contained nodes before tokenization') diff --git a/udapi/tool/udpipeonline.py b/udapi/tool/udpipeonline.py new file mode 100644 index 00000000..f0a835c9 --- /dev/null +++ b/udapi/tool/udpipeonline.py @@ -0,0 +1,168 @@ +"""Wrapper for UDPipe online web service.""" +import io +import sys +import email.mime.multipart +import email.mime.nonmultipart +import email.policy +import json +import os +import sys +import urllib.error +import urllib.request + +from udapi.block.read.conllu import Conllu as ConlluReader +from udapi.core.root import Root + +class UDPipeOnline: + """Wrapper for UDPipe online web service.""" + + def __init__(self, model, server="https://lindat.mff.cuni.cz/services/udpipe/api"): + """Create the UDPipeOnline tool object.""" + self.model = model + self.server = server + + def list_models(self): + with urllib.request.urlopen(self.server + "/models") as request: + response = json.loads(request.read()) + return list(response["models"].keys()) + + def perform_request(self, params, method="process"): + if not params: + request_headers, request_data = {}, None + else: + message = email.mime.multipart.MIMEMultipart("form-data", policy=email.policy.HTTP) + + for name, value in params.items(): + payload = email.mime.nonmultipart.MIMENonMultipart("text", "plain") + payload.add_header("Content-Disposition", "form-data; name=\"{}\"".format(name)) + payload.add_header("Content-Transfer-Encoding", "8bit") + payload.set_payload(value, charset="utf-8") + message.attach(payload) + + request_data = message.as_bytes().split(b"\r\n\r\n", maxsplit=1)[1] + request_headers = {"Content-Type": message["Content-Type"]} + + try: + with urllib.request.urlopen(urllib.request.Request( + url=f"{self.server}/{method}", headers=request_headers, data=request_data + )) as request: + response = json.loads(request.read()) + except urllib.error.HTTPError as e: + print("An exception was raised during UDPipe 'process' REST request.\n" + "The service returned the following error:\n" + " {}".format(e.fp.read().decode("utf-8")), file=sys.stderr) + raise + except json.JSONDecodeError as e: + print("Cannot parse the JSON response of UDPipe 'process' REST request.\n" + " {}".format(e.msg), file=sys.stderr) + raise + + if "model" not in response or "result" not in response: + raise ValueError("Cannot parse the UDPipe 'process' REST request response.") + + return response["result"] + + def tag_parse_tree(self, root, tag=True, parse=True): + """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + if not tag and not parse: + raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.') + descendants = root.descendants + if not descendants: + return + in_data = " ".join([n.form for n in descendants]) + params = {"model": self.model, "data": in_data, "input":"horizontal", "tagger":""} + 
attrs = 'upos xpos lemma feats'.split() if tag else [] + if parse: + params["parser"] = "" + attrs.append('deprel') + + out_data = self.perform_request(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + parsed_root = conllu_reader.read_tree() + if parse: + root.flatten() + for parsed_node in parsed_root.descendants: + node = descendants[parsed_node.ord - 1] + if parse: + node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root + for attr in attrs: + setattr(node, attr, getattr(parsed_node, attr)) + + def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, ranges=False): + """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. + + If resegment=True, the returned list of Udapi trees may contain multiple trees. + If ranges=True, `node.misc[TokenRange]` of each token will contain character-level 0-based ranges, e.g. `0:2`. + """ + if parse and not tag: + raise ValueError('Combination parse=True tag=False is not allowed.') + if root.children: + raise ValueError('Tree already contained nodes before tokenization') + + # Tokenize and possibly segment the input text + params = {"model": self.model, "data": root.text, "tokenizer":"" if resegment else "presegmented"} + if tag: + params["tagger"] = "" + if parse: + params["parser"] = "" + if ranges: + params["tokenizer"] = "presegmented;ranges" if resegment else "ranges" + out_data = self.perform_request(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + trees = conllu_reader.read_trees() + + # The input "root" object must be the first item in "trees". + for attr in ('_children', '_descendants', '_mwts', 'text', 'comment'): + setattr(root, attr, getattr(trees[0], attr)) + for node in root._children: + node._parent = root + for node in root._descendants: + node._root = root + trees[0] = root + return trees + + def segment_text(self, text): + """Segment the provided text into sentences returned as a Python list.""" + params = {"model": self.model, "data": text, "tokenizer":"", "output": "plaintext=normalized_spaces"} + return self.perform_request(params=params).rstrip().split("\n") + + def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=False, ranges=False): + """Delete all existing bundles and substitute them with those parsed by UDPipe.""" + if parse and not tag: + raise ValueError('Combination parse=True tag=False is not allowed.') + params = {"model": self.model, "tokenizer": "presegmented"} + if tag: + params["tagger"] = "" + if parse: + params["parser"] = "" + if resegment: + params["tokenizer"] = "" + if ranges: + params["tokenizer"] = "ranges" if resegment else "presegmented;ranges" + + #in_trees = [] + #for bundle in doc.bundles: + # assert(len(bundle.trees) == 1) + # in_trees.append(bundle.trees[0]) + if tokenize: + params["data"] = "\n".join(root.text for root in doc.trees) + "\n" + else: + params["input"] = "horizontal" + params["data"] = "\n".join(" ".join([n.form for n in root.descendants]) for root in doc.trees) + "\n" + + out_data = self.perform_request(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + trees = conllu_reader.read_trees() + + bundles = list(reversed(doc.bundles)) + for tree in trees: + if bundles: + bundle = bundles.pop() + # TODO is this safe?
+ bundle.trees = [] + else: + bundle = doc.create_bundle() + bundle.add_tree(tree)
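
For quick reference, here is a minimal usage sketch of the document-level coreference API introduced above, closely mirroring udapi/core/tests/test_coref.py. It is a sketch under stated assumptions, not part of the patch: the relative sample path assumes the snippet runs from a repository checkout, and the 'person' entity type is purely illustrative.

    import os
    import udapi

    # Load the coreference sample shipped with the tests
    # (any CoNLL-U file with Entity annotation in MISC works).
    path = os.path.join('udapi', 'core', 'tests', 'data', 'fr-democrat-dev-sample.conllu')
    doc = udapi.Document(path)

    # Entities and mentions are de-serialized lazily from MISC
    # on the first coref-related call.
    print(len(doc.coref_entities), len(doc.coref_mentions))

    # Create a new entity; without an explicit eid, the lowest unused 'eN' id is assigned.
    node = next(doc.nodes)
    entity = doc.create_coref_entity(etype='person')
    mention = entity.create_mention(words=[node])  # head defaults to words[0]
    print(entity.eid, mention.span)

    # eid_to_entity replaces the deprecated coref_clusters dict.
    assert doc.eid_to_entity[entity.eid] is entity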