Initial commit
This commit is contained in:
52
backend/venv/Lib/site-packages/nltk/tree/__init__.py
Normal file
52
backend/venv/Lib/site-packages/nltk/tree/__init__.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# Natural Language Toolkit: Text Trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# Tom Aarsen <>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
NLTK Tree Package
|
||||
|
||||
This package may be used for representing hierarchical language
|
||||
structures, such as syntax trees and morphological trees.
|
||||
"""
|
||||
|
||||
# TODO: add LabelledTree (can be used for dependency trees)
|
||||
|
||||
from nltk.tree.immutable import (
|
||||
ImmutableMultiParentedTree,
|
||||
ImmutableParentedTree,
|
||||
ImmutableProbabilisticTree,
|
||||
ImmutableTree,
|
||||
)
|
||||
from nltk.tree.parented import MultiParentedTree, ParentedTree
|
||||
from nltk.tree.parsing import bracket_parse, sinica_parse
|
||||
from nltk.tree.prettyprinter import TreePrettyPrinter
|
||||
from nltk.tree.probabilistic import ProbabilisticTree
|
||||
from nltk.tree.transforms import (
|
||||
chomsky_normal_form,
|
||||
collapse_unary,
|
||||
un_chomsky_normal_form,
|
||||
)
|
||||
from nltk.tree.tree import Tree
|
||||
|
||||
__all__ = [
|
||||
"ImmutableMultiParentedTree",
|
||||
"ImmutableParentedTree",
|
||||
"ImmutableProbabilisticTree",
|
||||
"ImmutableTree",
|
||||
"MultiParentedTree",
|
||||
"ParentedTree",
|
||||
"bracket_parse",
|
||||
"sinica_parse",
|
||||
"TreePrettyPrinter",
|
||||
"ProbabilisticTree",
|
||||
"chomsky_normal_form",
|
||||
"collapse_unary",
|
||||
"un_chomsky_normal_form",
|
||||
"Tree",
|
||||
]
|
||||
124
backend/venv/Lib/site-packages/nltk/tree/immutable.py
Normal file
124
backend/venv/Lib/site-packages/nltk/tree/immutable.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Natural Language Toolkit: Text Trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# Tom Aarsen <>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.probability import ProbabilisticMixIn
|
||||
from nltk.tree.parented import MultiParentedTree, ParentedTree
|
||||
from nltk.tree.tree import Tree
|
||||
|
||||
|
||||
class ImmutableTree(Tree):
    """
    A ``Tree`` whose label and children are frozen once ``__init__`` returns.

    The hash is computed a single time during construction from the label
    and the tuple of children, so construction fails with ``ValueError`` if
    either is unhashable.  Every list-style mutator is overridden to raise
    ``ValueError``; this includes ``insert`` and ``clear``, which would
    otherwise be inherited from a base class and silently invalidate the
    cached hash.
    """

    def __init__(self, node, children=None):
        super().__init__(node, children)
        # Precompute our hash value.  This ensures that we're really
        # immutable.  It also means we only have to calculate it once.
        try:
            self._hash = hash((self._label, tuple(self)))
        except (TypeError, ValueError) as e:
            raise ValueError(
                "%s: node value and children must be immutable" % type(self).__name__
            ) from e

    def _modification_error(self):
        """Build the ``ValueError`` raised by every blocked mutator."""
        return ValueError("%s may not be modified" % type(self).__name__)

    def __setitem__(self, index, value):
        raise self._modification_error()

    # __setslice__/__delslice__ are only ever invoked by Python 2; they are
    # kept so that explicit calls are still rejected.
    def __setslice__(self, i, j, value):
        raise self._modification_error()

    def __delitem__(self, index):
        raise self._modification_error()

    def __delslice__(self, i, j):
        raise self._modification_error()

    def __iadd__(self, other):
        raise self._modification_error()

    def __imul__(self, other):
        raise self._modification_error()

    def append(self, v):
        raise self._modification_error()

    def extend(self, v):
        raise self._modification_error()

    def insert(self, index, value):
        # Without this override the inherited insert() would mutate the
        # tree after its hash has been cached.
        raise self._modification_error()

    def clear(self):
        # Same reasoning as insert(): list.clear() must not be reachable.
        raise self._modification_error()

    def pop(self, v=None):
        raise self._modification_error()

    def remove(self, v):
        raise self._modification_error()

    def reverse(self):
        raise self._modification_error()

    def sort(self, *, key=None, reverse=False):
        # Accept list.sort()'s keyword arguments so that callers get the
        # immutability error rather than an unrelated TypeError.
        raise self._modification_error()

    def __hash__(self):
        return self._hash

    def set_label(self, value):
        """
        Set the node label.  This will only succeed the first time the
        node label is set, which should occur in ImmutableTree.__init__().

        :raise ValueError: If a label has already been stored on this tree.
        """
        if hasattr(self, "_label"):
            raise self._modification_error()
        self._label = value
|
||||
|
||||
|
||||
class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
    """
    An ``ImmutableTree`` that also carries a probability.

    The probability is supplied through the keyword arguments forwarded to
    ``ProbabilisticMixIn.__init__`` and takes part in the cached hash.
    """

    def __init__(self, node, children=None, **prob_kwargs):
        ImmutableTree.__init__(self, node, children)
        ProbabilisticMixIn.__init__(self, **prob_kwargs)
        # Recompute the hash so that it also depends on the probability.
        self._hash = hash((self._label, tuple(self), self.prob()))

    # We have to patch up these methods to make them work right:
    def _frozen_class(self):
        """Return the class used for frozen copies: this class itself."""
        return ImmutableProbabilisticTree

    def __repr__(self):
        return f"{Tree.__repr__(self)} [{self.prob()}]"

    def __str__(self):
        return f"{self.pformat(margin=60)} [{self.prob()}]"

    def copy(self, deep=False):
        """
        Return a copy of this tree.

        :param deep: If true, rebuild every subtree through ``convert``;
            otherwise build a new node of the same type that reuses the
            existing children and probability.
        """
        tree_type = type(self)
        if deep:
            return tree_type.convert(self)
        return tree_type(self._label, self, prob=self.prob())

    @classmethod
    def convert(cls, val):
        """
        Recursively convert ``val`` into an instance of ``cls``.

        Values that are not ``Tree`` instances (i.e. leaves) are returned
        unchanged.  Trees that are not ``ProbabilisticMixIn`` instances are
        given a probability of 1.0.
        """
        if not isinstance(val, Tree):
            return val
        converted_children = [cls.convert(child) for child in val]
        probability = val.prob() if isinstance(val, ProbabilisticMixIn) else 1.0
        return cls(val._label, converted_children, prob=probability)
|
||||
|
||||
|
||||
class ImmutableParentedTree(ImmutableTree, ParentedTree):
    """
    Combination of ``ImmutableTree`` and ``ParentedTree``.

    This class adds no members of its own; all behaviour comes from the
    two base classes via the method resolution order.
    """
|
||||
|
||||
|
||||
class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
    """
    Combination of ``ImmutableTree`` and ``MultiParentedTree``.

    This class adds no members of its own; all behaviour comes from the
    two base classes via the method resolution order.
    """
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ImmutableProbabilisticTree",
|
||||
"ImmutableTree",
|
||||
"ImmutableParentedTree",
|
||||
"ImmutableMultiParentedTree",
|
||||
]
|
||||
590
backend/venv/Lib/site-packages/nltk/tree/parented.py
Normal file
590
backend/venv/Lib/site-packages/nltk/tree/parented.py
Normal file
@@ -0,0 +1,590 @@
|
||||
# Natural Language Toolkit: Text Trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# Tom Aarsen <>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import warnings
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from nltk.tree.tree import Tree
|
||||
from nltk.util import slice_bounds
|
||||
|
||||
|
||||
######################################################################
|
||||
## Parented trees
|
||||
######################################################################
|
||||
class AbstractParentedTree(Tree, metaclass=ABCMeta):
    """
    An abstract base class for a ``Tree`` that automatically maintains
    pointers to parent nodes.  These parent pointers are updated
    whenever any change is made to a tree's structure.  Two subclasses
    are currently defined:

      - ``ParentedTree`` is used for tree structures where each subtree
        has at most one parent.  This class should be used in cases
        where there is no "sharing" of subtrees.

      - ``MultiParentedTree`` is used for tree structures where a
        subtree may have zero or more parents.  This class should be
        used in cases where subtrees may be shared.

    Subclassing
    ===========
    The ``AbstractParentedTree`` class redefines all operations that
    modify a tree's structure to call two methods, which are used by
    subclasses to update parent information:

      - ``_setparent()`` is called whenever a new child is added.
      - ``_delparent()`` is called whenever a child is removed.
    """

    def __init__(self, node, children=None):
        super().__init__(node, children)
        # If children is None, the tree is read from node, and
        # all parents will be set during parsing.
        if children is not None:
            # Otherwise we have to set the parent of the children.
            # Iterate over self, and *not* children, because children
            # might be an iterator.
            # First pass only validates, so a bad child is reported before
            # any parent pointer has been changed.
            for i, child in enumerate(self):
                if isinstance(child, Tree):
                    self._setparent(child, i, dry_run=True)
            for i, child in enumerate(self):
                if isinstance(child, Tree):
                    self._setparent(child, i)

    # ////////////////////////////////////////////////////////////
    # Parent management
    # ////////////////////////////////////////////////////////////
    @abstractmethod
    def _setparent(self, child, index, dry_run=False):
        """
        Update the parent pointer of ``child`` to point to ``self``.  This
        method is only called if the type of ``child`` is ``Tree``;
        i.e., it is not called when adding a leaf to a tree.  This method
        is always called before the child is actually added to the
        child list of ``self``.

        :type child: Tree
        :type index: int
        :param index: The index of ``child`` in ``self``.
        :raise TypeError: If ``child`` is a tree with an inappropriate
            type.  Typically, if ``child`` is a tree, then its type needs
            to match the type of ``self``.  This prevents mixing of
            different tree types (single-parented, multi-parented, and
            non-parented).
        :param dry_run: If true, then don't actually set the child's
            parent pointer; just check for any error conditions, and
            raise an exception if one is found.
        """

    @abstractmethod
    def _delparent(self, child, index):
        """
        Update the parent pointer of ``child`` to not point to self.  This
        method is only called if the type of ``child`` is ``Tree``; i.e., it
        is not called when removing a leaf from a tree.  This method
        is always called before the child is actually removed from the
        child list of ``self``.

        :type child: Tree
        :type index: int
        :param index: The index of ``child`` in ``self``.
        """

    # ////////////////////////////////////////////////////////////
    # Methods that add/remove children
    # ////////////////////////////////////////////////////////////
    # Every method that adds or removes a child must make
    # appropriate calls to _setparent() and _delparent().

    def __delitem__(self, index):
        # del ptree[start:stop]
        if isinstance(index, slice):
            start, stop, step = slice_bounds(self, index, allow_step=True)
            # Clear all the children pointers.
            for i in range(start, stop, step):
                if isinstance(self[i], Tree):
                    self._delparent(self[i], i)
            # Delete the children from our child list.
            super().__delitem__(index)

        # del ptree[i]
        elif isinstance(index, int):
            if index < 0:
                index += len(self)
            if index < 0:
                raise IndexError("index out of range")
            # Clear the child's parent pointer.
            if isinstance(self[index], Tree):
                self._delparent(self[index], index)
            # Remove the child from our child list.
            super().__delitem__(index)

        elif isinstance(index, (list, tuple)):
            # del ptree[()]
            if len(index) == 0:
                raise IndexError("The tree position () may not be deleted.")
            # del ptree[(i,)]
            elif len(index) == 1:
                del self[index[0]]
            # del ptree[i1, i2, i3]
            else:
                del self[index[0]][index[1:]]

        else:
            raise TypeError(
                "%s indices must be integers, not %s"
                % (type(self).__name__, type(index).__name__)
            )

    def __setitem__(self, index, value):
        # ptree[start:stop] = value
        if isinstance(index, slice):
            start, stop, step = slice_bounds(self, index, allow_step=True)
            # make a copy of value, in case it's an iterator
            if not isinstance(value, (list, tuple)):
                value = list(value)
            # Check for any error conditions, so we can avoid ending
            # up in an inconsistent state if an error does occur.
            for i, child in enumerate(value):
                if isinstance(child, Tree):
                    self._setparent(child, start + i * step, dry_run=True)
            # clear the child pointers of all parents we're removing
            for i in range(start, stop, step):
                if isinstance(self[i], Tree):
                    self._delparent(self[i], i)
            # set the child pointers of the new children.  We do this
            # after clearing *all* child pointers, in case we're e.g.
            # reversing the elements in a tree.
            for i, child in enumerate(value):
                if isinstance(child, Tree):
                    self._setparent(child, start + i * step)
            # finally, update the content of the child list itself.
            super().__setitem__(index, value)

        # ptree[i] = value
        elif isinstance(index, int):
            if index < 0:
                index += len(self)
            if index < 0:
                raise IndexError("index out of range")
            # if the value is not changing, do nothing.
            if value is self[index]:
                return
            # Set the new child's parent pointer.
            if isinstance(value, Tree):
                self._setparent(value, index)
            # Remove the old child's parent pointer
            if isinstance(self[index], Tree):
                self._delparent(self[index], index)
            # Update our child list.
            super().__setitem__(index, value)

        elif isinstance(index, (list, tuple)):
            # ptree[()] = value
            if len(index) == 0:
                raise IndexError("The tree position () may not be assigned to.")
            # ptree[(i,)] = value
            elif len(index) == 1:
                self[index[0]] = value
            # ptree[i1, i2, i3] = value
            else:
                self[index[0]][index[1:]] = value

        else:
            raise TypeError(
                "%s indices must be integers, not %s"
                % (type(self).__name__, type(index).__name__)
            )

    def append(self, child):
        if isinstance(child, Tree):
            self._setparent(child, len(self))
        super().append(child)

    def extend(self, children):
        """
        Append every item of ``children``, registering ``self`` as parent of
        each ``Tree`` among them.

        :param children: Any iterable, including a one-shot iterator or
            ``self``.
        """
        # Snapshot the argument first.  Iterating it lazily while appending
        # never terminates when ``children`` is ``self``, and a one-shot
        # iterator could not be validated before being consumed.
        children = list(children)
        # Validate everything before touching any parent pointer, so that a
        # rejected child does not leave the tree half-extended.
        for offset, child in enumerate(children):
            if isinstance(child, Tree):
                self._setparent(child, len(self) + offset, dry_run=True)
        for child in children:
            if isinstance(child, Tree):
                self._setparent(child, len(self))
            super().append(child)

    def insert(self, index, child):
        # Handle negative indexes.  Note that if index < -len(self),
        # we do *not* raise an IndexError, unlike __getitem__.  This
        # is done for consistency with list.insert, which clamps
        # out-of-range positions instead of raising.
        if index < 0:
            index += len(self)
        if index < 0:
            index = 0
        # Set the child's parent, and update our child list.
        if isinstance(child, Tree):
            self._setparent(child, index)
        super().insert(index, child)

    def pop(self, index=-1):
        if index < 0:
            index += len(self)
        if index < 0:
            raise IndexError("index out of range")
        if isinstance(self[index], Tree):
            self._delparent(self[index], index)
        return super().pop(index)

    # n.b.: like `list`, this is done by equality, not identity!
    # To remove a specific child, use del ptree[i].
    def remove(self, child):
        index = self.index(child)
        if isinstance(self[index], Tree):
            self._delparent(self[index], index)
        super().remove(child)

    # We need to implement __getslice__ and friends, even though
    # they're deprecated, because otherwise list.__getslice__ will get
    # called (since we're subclassing from list).  Just delegate to
    # __getitem__ etc., but use max(0, start) and max(0, stop) because
    # negative indices are already handled *before*
    # __getslice__ is called; and we don't want to double-count them.
    if hasattr(list, "__getslice__"):

        def __getslice__(self, start, stop):
            return self.__getitem__(slice(max(0, start), max(0, stop)))

        def __delslice__(self, start, stop):
            return self.__delitem__(slice(max(0, start), max(0, stop)))

        def __setslice__(self, start, stop, value):
            return self.__setitem__(slice(max(0, start), max(0, stop)), value)

    def __getnewargs__(self):
        """Method used by the pickle module when un-pickling.
        This method provides the arguments passed to ``__new__``
        upon un-pickling. Without this method, ParentedTree instances
        cannot be pickled and unpickled in Python 3.7+ onwards.

        :return: Tuple of arguments for ``__new__``, i.e. the label
            and the children of this node.
        :rtype: Tuple[Any, List[AbstractParentedTree]]
        """
        return (self._label, list(self))
|
||||
|
||||
|
||||
class ParentedTree(AbstractParentedTree):
    """
    A ``Tree`` that keeps a pointer to its single parent up to date.

    Structure queries: ``parent()``, ``parent_index()``,
    ``left_sibling()``, ``right_sibling()``, ``root()`` and
    ``treeposition()``.

    A ``ParentedTree`` has at most one parent, so subtrees cannot be
    shared.  Inserting a ``ParentedTree`` that already has a parent
    (whether into another tree or a second time into the same one)
    raises ``ValueError``.

    ``ParentedTrees`` should never be used in the same tree as ``Trees``
    or ``MultiParentedTrees``.  Mixing tree implementations may result
    in incorrect parent pointers and in ``TypeError`` exceptions.
    """

    def __init__(self, node, children=None):
        # The parent of this Tree, or None if it has no parent.  It must
        # exist before the base constructor runs.
        self._parent = None
        super().__init__(node, children)
        if children is not None:
            return
        # If children is None, the tree is read from node.
        # After parsing, the parent of the immediate children
        # will point to an intermediate tree, not self.
        # We fix this by brute force:
        for position, subtree in enumerate(self):
            if isinstance(subtree, Tree):
                subtree._parent = None
                self._setparent(subtree, position)

    def _frozen_class(self):
        # Imported here rather than at module level; nltk.tree.immutable
        # itself imports this module.
        from nltk.tree.immutable import ImmutableParentedTree

        return ImmutableParentedTree

    def copy(self, deep=False):
        """
        Return a deep copy of this tree.

        A shallow copy is not supported; requesting one (``deep=False``)
        emits a warning and a deep copy is returned anyway.
        """
        if not deep:
            warnings.warn(
                f"{self.__class__.__name__} objects do not support shallow copies. Defaulting to a deep copy."
            )
        return super().copy(deep=True)

    # /////////////////////////////////////////////////////////////////
    # Methods
    # /////////////////////////////////////////////////////////////////

    def parent(self):
        """The parent of this tree, or None if it has no parent."""
        return self._parent

    def parent_index(self):
        """
        The index of this tree in its parent, or None if it has no parent.
        I.e., ``ptree.parent()[ptree.parent_index()] is ptree``.  Note that
        ``ptree.parent_index()`` is not necessarily equal to
        ``ptree.parent().index(ptree)``, since the ``index()`` method
        returns the first child that is *equal* to its argument, whereas
        this method matches by identity.
        """
        if self._parent is None:
            return None
        found = next(
            (pos for pos, sibling in enumerate(self._parent) if sibling is self),
            None,
        )
        assert found is not None, "expected to find self in self._parent!"
        return found

    def left_sibling(self):
        """The left sibling of this tree, or None if it has none."""
        own_index = self.parent_index()
        if not self._parent or own_index <= 0:
            return None  # no left sibling
        return self._parent[own_index - 1]

    def right_sibling(self):
        """The right sibling of this tree, or None if it has none."""
        own_index = self.parent_index()
        if not self._parent or own_index >= (len(self._parent) - 1):
            return None  # no right sibling
        return self._parent[own_index + 1]

    def root(self):
        """
        The root of this tree.  I.e., the unique ancestor of this tree
        whose parent is None.  If ``ptree.parent()`` is None, then
        ``ptree`` is its own root.
        """
        node = self
        ancestor = node.parent()
        while ancestor is not None:
            node = ancestor
            ancestor = node.parent()
        return node

    def treeposition(self):
        """
        The tree position of this tree, relative to the root of the
        tree.  I.e., ``ptree.root()[ptree.treeposition()] is ptree``.
        """
        container = self.parent()
        if container is None:
            return ()
        return container.treeposition() + (self.parent_index(),)

    # /////////////////////////////////////////////////////////////////
    # Parent Management
    # /////////////////////////////////////////////////////////////////

    def _delparent(self, child, index):
        # Sanity checks
        assert isinstance(child, ParentedTree)
        assert self[index] is child
        assert child._parent is self

        # Delete child's parent pointer.
        child._parent = None

    def _setparent(self, child, index, dry_run=False):
        # If the child's type is incorrect, then complain.
        if not isinstance(child, ParentedTree):
            raise TypeError("Can not insert a non-ParentedTree into a ParentedTree")

        # If child already has a parent, then complain.
        if getattr(child, "_parent", None) is not None:
            raise ValueError("Can not insert a subtree that already has a parent.")

        # With dry_run the checks above are all that happens.
        if dry_run:
            return
        child._parent = self
|
||||
|
||||
|
||||
class MultiParentedTree(AbstractParentedTree):
    """
    A ``Tree`` that automatically maintains parent pointers for
    multi-parented trees.  The following are methods for querying the
    structure of a multi-parented tree: ``parents()``, ``parent_indices()``,
    ``left_siblings()``, ``right_siblings()``, ``roots()``,
    ``treepositions()``.

    Each ``MultiParentedTree`` may have zero or more parents.  In
    particular, subtrees may be shared.  If a single
    ``MultiParentedTree`` is used as multiple children of the same
    parent, then that parent will appear only once in its
    ``parents()`` method.

    ``MultiParentedTrees`` should never be used in the same tree as
    ``Trees`` or ``ParentedTrees``.  Mixing tree implementations may
    result in incorrect parent pointers and in ``TypeError`` exceptions.
    """

    def __init__(self, node, children=None):
        # A list of this tree's parents.  This list should not
        # contain duplicates, even if a parent contains this tree
        # multiple times.
        self._parents = []
        super().__init__(node, children)
        if children is None:
            # If children is None, the tree is read from node.
            # After parsing, the parent(s) of the immediate children
            # will point to an intermediate tree, not self.
            # We fix this by brute force:
            for i, child in enumerate(self):
                if isinstance(child, Tree):
                    child._parents = []
                    self._setparent(child, i)

    def _frozen_class(self):
        # Imported here rather than at module level; nltk.tree.immutable
        # itself imports this module.
        from nltk.tree.immutable import ImmutableMultiParentedTree

        return ImmutableMultiParentedTree

    # /////////////////////////////////////////////////////////////////
    # Methods
    # /////////////////////////////////////////////////////////////////

    def parents(self):
        """
        The parents of this tree, as a new list.  If this tree has no
        parents, then the list is empty.  To check if a tree is used
        as multiple children of the same parent, use the
        ``parent_indices()`` method.

        :type: list(MultiParentedTree)
        """
        return list(self._parents)

    def left_siblings(self):
        """
        A list of all left siblings of this tree, in any of its parent
        trees.  A tree may be its own left sibling if it is used as
        multiple contiguous children of the same parent.  A tree may
        appear multiple times in this list if it is the left sibling
        of this tree with respect to multiple parents.

        :type: list(MultiParentedTree)
        """
        return [
            parent[index - 1]
            for (parent, index) in self._get_parent_indices()
            if index > 0
        ]

    def right_siblings(self):
        """
        A list of all right siblings of this tree, in any of its parent
        trees.  A tree may be its own right sibling if it is used as
        multiple contiguous children of the same parent.  A tree may
        appear multiple times in this list if it is the right sibling
        of this tree with respect to multiple parents.

        :type: list(MultiParentedTree)
        """
        return [
            parent[index + 1]
            for (parent, index) in self._get_parent_indices()
            if index < (len(parent) - 1)
        ]

    def _get_parent_indices(self):
        """Return ``(parent, index)`` for every slot holding this tree."""
        return [
            (parent, index)
            for parent in self._parents
            for index, child in enumerate(parent)
            if child is self
        ]

    def roots(self):
        """
        A list of all roots of this tree, without duplicates.  It is
        formed by tracing all possible parent paths until trees with no
        parents are found.

        :type: list(MultiParentedTree)
        """
        return list(self._get_roots_helper({}).values())

    def _get_roots_helper(self, result):
        # ``result`` maps id(root) -> root, so a root reached along several
        # parent paths is recorded only once.
        if self._parents:
            for parent in self._parents:
                parent._get_roots_helper(result)
        else:
            result[id(self)] = self
        return result

    def parent_indices(self, parent):
        """
        Return a list of the indices where this tree occurs as a child
        of ``parent``.  If this child does not occur as a child of
        ``parent``, then the empty list is returned.  The following is
        always true::

          for parent_index in ptree.parent_indices(parent):
              parent[parent_index] is ptree
        """
        # Compare by identity: ``in`` would use ``==`` and so could match
        # a different tree that merely looks the same as one of our parents.
        if not any(known is parent for known in self._parents):
            return []
        return [index for (index, child) in enumerate(parent) if child is self]

    def treepositions(self, root):
        """
        Return a list of all tree positions that can be used to reach
        this multi-parented tree starting from ``root``.  I.e., the
        following is always true::

          for treepos in ptree.treepositions(root):
              root[treepos] is ptree
        """
        if self is root:
            return [()]
        else:
            return [
                treepos + (index,)
                for parent in self._parents
                for treepos in parent.treepositions(root)
                for (index, child) in enumerate(parent)
                if child is self
            ]

    # /////////////////////////////////////////////////////////////////
    # Parent Management
    # /////////////////////////////////////////////////////////////////

    def _delparent(self, child, index):
        # Sanity checks
        assert isinstance(child, MultiParentedTree)
        assert self[index] is child
        assert len([p for p in child._parents if p is self]) == 1

        # If child also occupies another slot of self, self stays a parent.
        if any(c is child and i != index for i, c in enumerate(self)):
            return

        # Otherwise delete self from child's parent list.  This is done by
        # identity: list.remove() compares with ``==`` and would drop the
        # first parent that compares equal to self, which is the wrong entry
        # when two equal-looking parents share this child.
        for pos, parent in enumerate(child._parents):
            if parent is self:
                del child._parents[pos]
                return
        raise ValueError("parent not found in child's parent list")

    def _setparent(self, child, index, dry_run=False):
        # If the child's type is incorrect, then complain.
        if not isinstance(child, MultiParentedTree):
            raise TypeError(
                "Can not insert a non-MultiParentedTree into a MultiParentedTree"
            )

        # Add self as a parent pointer if it's not already listed.
        if not dry_run:
            for parent in child._parents:
                if parent is self:
                    break
            else:
                child._parents.append(self)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ParentedTree",
|
||||
"MultiParentedTree",
|
||||
]
|
||||
66
backend/venv/Lib/site-packages/nltk/tree/parsing.py
Normal file
66
backend/venv/Lib/site-packages/nltk/tree/parsing.py
Normal file
@@ -0,0 +1,66 @@
|
||||
# Natural Language Toolkit: Text Trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# Tom Aarsen <>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
|
||||
from nltk.tree.tree import Tree
|
||||
|
||||
######################################################################
|
||||
## Parsing
|
||||
######################################################################
|
||||
|
||||
|
||||
def bracket_parse(s):
    """
    Removed entry point; always raises ``NameError``.

    Use ``Tree.fromstring(s, remove_empty_top_bracketing=True)`` instead.

    :param s: Ignored.
    :raise NameError: Always, with a message naming the replacement call.
    """
    # The previous message pointed at ``Tree.read``; the replacement used
    # elsewhere in this module is ``Tree.fromstring``.
    raise NameError(
        "Use Tree.fromstring(s, remove_empty_top_bracketing=True) instead."
    )
|
||||
|
||||
|
||||
def sinica_parse(s):
    """
    Parse a Sinica Treebank string and return a tree.  Trees are represented as nested brackettings,
    as shown in the following example (X represents a Chinese character):
    S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)

    The string is rewritten token by token into bracketed form and then
    handed to ``Tree.fromstring``.

    :return: A tree corresponding to the string representation.
    :rtype: Tree
    :param s: The string to be converted
    :type s: str
    """
    # The capturing group keeps the delimiters as tokens of their own.
    pieces = re.split(r"([()| ])", s)
    for pos, piece in enumerate(pieces):
        if piece == "(":
            # pull nonterminal inside parens
            pieces[pos - 1], pieces[pos] = piece, pieces[pos - 1]
        elif ":" in piece:
            parts = piece.split(":")
            if len(parts) == 2:  # non-terminal
                pieces[pos] = parts[1]
            else:
                pieces[pos] = "(" + parts[-2] + " " + parts[-1] + ")"
        elif piece == "|":
            # Sibling separators carry no information once bracketed.
            pieces[pos] = ""

    return Tree.fromstring(" ".join(pieces), remove_empty_top_bracketing=True)
|
||||
|
||||
|
||||
# s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier
|
||||
# s = re.sub(r'\w+:', '', s) # remove role tags
|
||||
|
||||
# return s
|
||||
|
||||
__all__ = [
|
||||
"bracket_parse",
|
||||
"sinica_parse",
|
||||
]
|
||||
627
backend/venv/Lib/site-packages/nltk/tree/prettyprinter.py
Normal file
627
backend/venv/Lib/site-packages/nltk/tree/prettyprinter.py
Normal file
@@ -0,0 +1,627 @@
|
||||
# Natural Language Toolkit: ASCII visualization of NLTK trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Andreas van Cranenburgh <A.W.vanCranenburgh@uva.nl>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Pretty-printing of discontinuous trees.
|
||||
Adapted from the disco-dop project, by Andreas van Cranenburgh.
|
||||
https://github.com/andreasvc/disco-dop
|
||||
|
||||
Interesting reference (not used for this code):
|
||||
T. Eschbach et al., Orth. Hypergraph Drawing, Journal of
|
||||
Graph Algorithms and Applications, 10(2) 141--157 (2006)149.
|
||||
https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
try:
|
||||
from html import escape
|
||||
except ImportError:
|
||||
from cgi import escape
|
||||
|
||||
from collections import defaultdict
|
||||
from operator import itemgetter
|
||||
|
||||
from nltk.tree.tree import Tree
|
||||
from nltk.util import OrderedDict
|
||||
|
||||
# Color name -> numeric code that ``TreePrettyPrinter.text(ansi=True)``
# interpolates into the escape sequence ``"\x1b[%d;1m"`` to color a label.
ANSICOLOR = {
    "black": 30,
    "red": 31,
    "green": 32,
    "yellow": 33,
    "blue": 34,
    "magenta": 35,
    "cyan": 36,
    "white": 37,
}
|
||||
|
||||
|
||||
class TreePrettyPrinter:
    """
    Pretty-print a tree in text format, either as ASCII or Unicode.
    The tree can be a normal tree, or discontinuous.

    ``TreePrettyPrinter(tree, sentence=None, highlight=())``
    creates an object from which different visualizations can be created.

    :param tree: a Tree object.
    :param sentence: a list of words (strings). If `sentence` is given,
        `tree` must contain integers as leaves, which are taken as indices
        in `sentence`. Using this you can display a discontinuous tree.
    :param highlight: Optionally, a sequence of Tree objects in `tree` which
        should be highlighted. Has the effect of only applying colors to nodes
        in this sequence (nodes should be given as Tree objects, terminals as
        indices).

    >>> from nltk.tree import Tree
    >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))')
    >>> print(TreePrettyPrinter(tree).text())
    ... # doctest: +NORMALIZE_WHITESPACE
          S
      ____|____
     NP        VP
     |         |
    Mary     walks
    """

    def __init__(self, tree, sentence=None, highlight=()):
        """
        Compute the grid layout for ``tree`` and store it on the instance.

        When ``sentence`` is None it is derived from the tree: either the
        integer leaves are used as-is, or (on a deep copy of the tree) every
        leaf is replaced by its index into a freshly built word list.
        """
        if sentence is None:
            leaves = tree.leaves()
            if (
                leaves
                and all(len(a) > 0 for a in tree.subtrees())
                and all(isinstance(a, int) for a in leaves)
            ):
                sentence = [str(a) for a in leaves]
            else:
                # this deals with empty nodes (frontier non-terminals)
                # and multiple/mixed terminals under non-terminals.
                tree = tree.copy(True)
                sentence = []
                for a in tree.subtrees():
                    if len(a) == 0:
                        # frontier non-terminal: give it a placeholder leaf
                        a.append(len(sentence))
                        sentence.append(None)
                    elif any(not isinstance(b, Tree) for b in a):
                        for n, b in enumerate(a):
                            if not isinstance(b, Tree):
                                a[n] = len(sentence)
                                # isinstance (not ``type(b) == tuple``) so
                                # tuple subclasses are joined too; otherwise
                                # ``"%s" % b`` would treat them as an
                                # argument tuple and fail.
                                if isinstance(b, tuple):
                                    b = "/".join(b)
                                sentence.append("%s" % b)
        self.nodes, self.coords, self.edges, self.highlight = self.nodecoords(
            tree, sentence, highlight
        )

    def __str__(self):
        """Return the default rendering, i.e. ``self.text()``."""
        return self.text()

    def __repr__(self):
        """Return a short summary that includes the number of nodes."""
        return "<TreePrettyPrinter with %d nodes>" % len(self.nodes)

    @staticmethod
    def nodecoords(tree, sentence, highlight):
        """
        Produce coordinates of nodes on a grid.

        Objective:

        - Produce coordinates for a non-overlapping placement of nodes and
            horizontal lines.
        - Order edges so that crossing edges cross a minimal number of previous
            horizontal lines (never vertical lines).

        Approach:

        - bottom up level order traversal (start at terminals)
        - at each level, identify nodes which cannot be on the same row
        - identify nodes which cannot be in the same column
        - place nodes into a grid at (row, column)
        - order child-parent edges with crossing edges last

        Coordinates are (row, column); the origin (0, 0) is at the top left;
        the root node is on row 0. Coordinates do not consider the size of a
        node (which depends on font, &c), so the width of a column of the grid
        should be automatically determined by the element with the greatest
        width in that column. Alternatively, the integer coordinates could be
        converted to coordinates in which the distances between adjacent nodes
        are non-uniform.

        Produces tuple (nodes, coords, edges, highlighted) where:

        - nodes[id]: Tree object for the node with this integer id
        - coords[id]: (n, m) coordinate where to draw node with id in the grid
        - edges[id]: parent id of node with this id (ordered dictionary)
        - highlighted: set of ids that should be highlighted

        :raises ValueError: if a leaf is not an integer, an index occurs more
            than once, an index is outside ``range(len(sentence))``, or no
            free grid cell can be found for a node.
        """

        def findcell(m, matrix, startoflevel, children):
            """
            Find vacant row, column index for node ``m``.
            Iterate over current rows for this level (try lowest first)
            and look for cell between first and last child of this node,
            add new row to level if no free row available.
            """
            candidates = [a for _, a in children[m]]
            minidx, maxidx = min(candidates), max(candidates)
            leaves = tree[m].leaves()
            center = scale * sum(leaves) // len(leaves)  # center of gravity
            if minidx < maxidx and not minidx < center < maxidx:
                center = sum(candidates) // len(candidates)
            if max(candidates) - min(candidates) > 2 * scale:
                center -= center % scale  # round to unscaled coordinate
                if minidx < maxidx and not minidx < center < maxidx:
                    center += scale
            if ids[m] == 0:
                startoflevel = len(matrix)
            for rowidx in range(startoflevel, len(matrix) + 1):
                if rowidx == len(matrix):  # need to add a new row
                    matrix.append(
                        [
                            vertline if a not in (corner, None) else None
                            for a in matrix[-1]
                        ]
                    )
                row = matrix[rowidx]
                if len(children[m]) == 1:  # place unaries directly above child
                    return rowidx, next(iter(children[m]))[1]
                elif all(
                    a is None or a == vertline
                    for a in row[min(candidates) : max(candidates) + 1]
                ):
                    # find free column
                    for n in range(scale):
                        i = j = center + n
                        while j > minidx or i < maxidx:
                            if i < maxidx and (
                                matrix[rowidx][i] is None or i in candidates
                            ):
                                return rowidx, i
                            elif j > minidx and (
                                matrix[rowidx][j] is None or j in candidates
                            ):
                                return rowidx, j
                            i += scale
                            j -= scale
            # The arguments must follow the placeholders (%s, %s, %d, %d);
            # passing the matrix dump last made ``%d`` receive a string and
            # masked this error with a TypeError.
            raise ValueError(
                "could not find a free cell for:\n%s\n%s\n"
                "min=%d; max=%d" % (tree[m], dumpmatrix(), minidx, maxidx)
            )

        def dumpmatrix():
            """Dump matrix contents for debugging purposes."""
            return "\n".join(
                "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row))
                for n, row in enumerate(matrix)
            )

        leaves = tree.leaves()
        if not all(isinstance(n, int) for n in leaves):
            raise ValueError("All leaves must be integer indices.")
        if len(leaves) != len(set(leaves)):
            raise ValueError("Indices must occur at most once.")
        if not all(0 <= n < len(sentence) for n in leaves):
            raise ValueError(
                "All leaves must be in the interval 0..n "
                "with n=len(sentence)\ntokens: %d indices: "
                "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence)
            )
        # Negative markers stored in grid cells; node ids are >= 0.
        vertline, corner = -1, -2  # constants
        # Work on a deep copy: children are re-sorted below.
        tree = tree.copy(True)
        for a in tree.subtrees():
            a.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n)
        scale = 2
        crossed = set()
        # internal nodes and lexical nodes (no frontiers)
        positions = tree.treepositions()
        maxdepth = max(map(len, positions)) + 1
        childcols = defaultdict(set)
        matrix = [[None] * (len(sentence) * scale)]
        nodes = {}
        ids = {a: n for n, a in enumerate(positions)}
        # An empty ``highlight`` selects every node.
        highlighted_nodes = {
            n for a, n in ids.items() if not highlight or tree[a] in highlight
        }
        levels = {n: [] for n in range(maxdepth - 1)}
        terminals = []
        for a in positions:
            node = tree[a]
            if isinstance(node, Tree):
                levels[maxdepth - node.height()].append(a)
            else:
                terminals.append(a)

        # Within a level, handle nodes with the narrowest leaf span first.
        for n in levels:
            levels[n].sort(
                key=lambda pos: max(tree[pos].leaves()) - min(tree[pos].leaves())
            )
        terminals.sort()
        positions = set(positions)

        # Terminals go on the first matrix row, at column index * scale.
        for m in terminals:
            i = int(tree[m]) * scale
            assert matrix[0][i] is None, (matrix[0][i], m, i)
            matrix[0][i] = ids[m]
            nodes[ids[m]] = sentence[tree[m]]
            if nodes[ids[m]] is None:
                # placeholder leaf of a frontier non-terminal
                nodes[ids[m]] = "..."
                highlighted_nodes.discard(ids[m])
            positions.remove(m)
            childcols[m[:-1]].add((0, i))

        # add other nodes centered on their children,
        # if the center is already taken, back off
        # to the left and right alternately, until an empty cell is found.
        for n in sorted(levels, reverse=True):
            nodesatdepth = levels[n]
            startoflevel = len(matrix)
            matrix.append(
                [vertline if a not in (corner, None) else None for a in matrix[-1]]
            )
            for m in nodesatdepth:  # [::-1]:
                if n < maxdepth - 1 and childcols[m]:
                    _, pivot = min(childcols[m], key=itemgetter(1))
                    if {
                        a[:-1]
                        for row in matrix[:-1]
                        for a in row[:pivot]
                        if isinstance(a, tuple)
                    } & {
                        a[:-1]
                        for row in matrix[:-1]
                        for a in row[pivot:]
                        if isinstance(a, tuple)
                    }:
                        crossed.add(m)

                rowidx, i = findcell(m, matrix, startoflevel, childcols)
                positions.remove(m)

                # block positions where children of this node branch out
                for _, x in childcols[m]:
                    matrix[rowidx][x] = corner
                # assert m == () or matrix[rowidx][i] in (None, corner), (
                #         matrix[rowidx][i], m, str(tree), ' '.join(sentence))
                # node itself
                matrix[rowidx][i] = ids[m]
                nodes[ids[m]] = tree[m]
                # add column to the set of children for its parent
                if len(m) > 0:
                    childcols[m[:-1]].add((rowidx, i))
        assert len(positions) == 0

        # remove unused columns, right to left
        for m in range(scale * len(sentence) - 1, -1, -1):
            if not any(isinstance(row[m], (Tree, int)) for row in matrix):
                for row in matrix:
                    del row[m]

        # remove unused rows, reverse
        matrix = [
            row
            for row in reversed(matrix)
            if not all(a is None or a == vertline for a in row)
        ]

        # collect coordinates of nodes
        coords = {}
        for n, _ in enumerate(matrix):
            for m, i in enumerate(matrix[n]):
                if isinstance(i, int) and i >= 0:
                    coords[i] = n, m

        # move crossed edges last
        positions = sorted(
            (a for level in levels.values() for a in level),
            key=lambda a: a[:-1] in crossed,
        )

        # collect edges from node to node
        edges = OrderedDict()
        for i in reversed(positions):
            for j, _ in enumerate(tree[i]):
                edges[ids[i + (j,)]] = ids[i]

        return nodes, coords, edges, highlighted_nodes

    def text(
        self,
        nodedist=1,
        unicodelines=False,
        html=False,
        ansi=False,
        nodecolor="blue",
        leafcolor="red",
        funccolor="green",
        abbreviate=None,
        maxwidth=16,
    ):
        """
        :return: ASCII art for a discontinuous tree.

        :param nodedist: number of filler characters inserted between
            adjacent columns.
        :param unicodelines: whether to use Unicode line drawing characters
            instead of plain (7-bit) ASCII.
        :param html: whether to wrap output in html code (default plain text).
        :param ansi: whether to produce colors with ANSI escape sequences
            (only effective when html==False).
        :param leafcolor, nodecolor: specify colors of leaves and phrasal
            nodes; effective when either html or ansi is True.
        :param funccolor: color used for Tree nodes whose label starts
            with "-".
        :param abbreviate: if True, abbreviate labels longer than 5 characters.
            If integer, abbreviate labels longer than `abbr` characters.
        :param maxwidth: maximum number of characters before a label starts to
            wrap; pass None to disable.
        """
        # Identity test: ``abbreviate == True`` is also true for the integer
        # 1, which silently turned ``abbreviate=1`` into 5.
        if abbreviate is True:
            abbreviate = 5
        if unicodelines:
            horzline = "\u2500"
            leftcorner = "\u250c"
            rightcorner = "\u2510"
            vertline = " \u2502 "
            tee = horzline + "\u252C" + horzline
            bottom = horzline + "\u2534" + horzline
            cross = horzline + "\u253c" + horzline
            ellipsis = "\u2026"
        else:
            horzline = "_"
            leftcorner = rightcorner = " "
            vertline = " | "
            tee = 3 * horzline
            cross = bottom = "_|_"
            ellipsis = "."

        def crosscell(cur, x=vertline):
            """Overwrite center of this cell with a vertical branch."""
            splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1
            lst = list(cur)
            lst[splitl : splitl + len(x)] = list(x)
            return "".join(lst)

        result = []
        matrix = defaultdict(dict)
        maxnodewith = defaultdict(lambda: 3)
        maxnodeheight = defaultdict(lambda: 1)
        maxcol = 0
        minchildcol = {}
        maxchildcol = {}
        childcols = defaultdict(set)
        labels = {}
        # Build the wrapping pattern only when wrapping is enabled;
        # ``maxwidth=None`` (documented as "disable") used to fail on
        # ``maxwidth - 4``.
        wrapre = (
            re.compile("(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth))
            if maxwidth
            else None
        )
        # collect labels and coordinates
        for a in self.nodes:
            row, column = self.coords[a]
            matrix[row][column] = a
            maxcol = max(maxcol, column)
            label = (
                self.nodes[a].label()
                if isinstance(self.nodes[a], Tree)
                else self.nodes[a]
            )
            if abbreviate and len(label) > abbreviate:
                label = label[:abbreviate] + ellipsis
            if maxwidth and len(label) > maxwidth:
                label = wrapre.sub(r"\1\n", label).strip()
            label = label.split("\n")
            maxnodeheight[row] = max(maxnodeheight[row], len(label))
            maxnodewith[column] = max(maxnodewith[column], max(map(len, label)))
            labels[a] = label
            if a not in self.edges:
                continue  # e.g., root
            parent = self.edges[a]
            childcols[parent].add((row, column))
            minchildcol[parent] = min(minchildcol.get(parent, column), column)
            maxchildcol[parent] = max(maxchildcol.get(parent, column), column)
        # bottom up level order traversal
        for row in sorted(matrix, reverse=True):
            noderows = [
                ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
                for _ in range(maxnodeheight[row])
            ]
            branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
            for col in matrix[row]:
                n = matrix[row][col]
                node = self.nodes[n]
                text = labels[n]
                if isinstance(node, Tree):
                    # draw horizontal branch towards children for this node
                    if n in minchildcol and minchildcol[n] < maxchildcol[n]:
                        i, j = minchildcol[n], maxchildcol[n]
                        a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2
                        branchrow[i] = ((" " * a) + leftcorner).ljust(
                            maxnodewith[i], horzline
                        )
                        branchrow[j] = (rightcorner + (" " * b)).rjust(
                            maxnodewith[j], horzline
                        )
                        for i in range(minchildcol[n] + 1, maxchildcol[n]):
                            if i == col and any(a == i for _, a in childcols[n]):
                                line = cross
                            elif i == col:
                                line = bottom
                            elif any(a == i for _, a in childcols[n]):
                                line = tee
                            else:
                                line = horzline
                            branchrow[i] = line.center(maxnodewith[i], horzline)
                    else:  # if n and n in minchildcol:
                        branchrow[col] = crosscell(branchrow[col])
                text = [a.center(maxnodewith[col]) for a in text]
                color = nodecolor if isinstance(node, Tree) else leafcolor
                if isinstance(node, Tree) and node.label().startswith("-"):
                    color = funccolor
                if html:
                    text = [escape(a, quote=False) for a in text]
                    if n in self.highlight:
                        text = [f"<font color={color}>{a}</font>" for a in text]
                elif ansi and n in self.highlight:
                    text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text]
                for x in range(maxnodeheight[row]):
                    # draw vertical lines in partially filled multiline node
                    # labels, but only if it's not a frontier node.
                    noderows[x][col] = (
                        text[x]
                        if x < len(text)
                        else (vertline if childcols[n] else " ").center(
                            maxnodewith[col], " "
                        )
                    )
            # for each column, if there is a node below us which has a parent
            # above us, draw a vertical branch in that column.
            if row != max(matrix):
                for n, (childrow, col) in self.coords.items():
                    if n > 0 and self.coords[self.edges[n]][0] < row < childrow:
                        branchrow[col] = crosscell(branchrow[col])
                        if col not in matrix[row]:
                            for noderow in noderows:
                                noderow[col] = crosscell(noderow[col])
                # Pad every cell by ``nodedist``, extending a line character
                # when the cell ends in one (or the next cell starts with one).
                branchrow = [
                    a + ((a[-1] if a[-1] != " " else b[0]) * nodedist)
                    for a, b in zip(branchrow, branchrow[1:] + [" "])
                ]
                result.append("".join(branchrow))
            result.extend(
                (" " * nodedist).join(noderow) for noderow in reversed(noderows)
            )
        # Rows were produced bottom-up; flip them for output.
        return "\n".join(reversed(result)) + "\n"

    def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"):
        """
        :return: SVG representation of a tree.

        :param nodecolor, leafcolor, funccolor: fill colors for highlighted
            Tree nodes, highlighted leaves, and highlighted Tree nodes whose
            label starts with "-"; nodes that are not highlighted are black.
        """
        fontsize = 12
        hscale = 40
        vscale = 25
        hstart = vstart = 20
        width = max(col for _, col in self.coords.values())
        height = max(row for row, _ in self.coords.values())
        result = [
            '<svg version="1.1" xmlns="http://www.w3.org/2000/svg" '
            'width="%dem" height="%dem" viewBox="%d %d %d %d">'
            % (
                width * 3,
                height * 2.5,
                -hstart,
                -vstart,
                width * hscale + 3 * hstart,
                height * vscale + 3 * vstart,
            )
        ]

        children = defaultdict(set)
        for n in self.nodes:
            # id 0 (falsy) is skipped: it has no entry in ``self.edges``
            if n:
                children[self.edges[n]].add(n)

        # horizontal branches from nodes to children
        for node in self.nodes:
            if not children[node]:
                continue
            y, x = self.coords[node]
            x *= hscale
            y *= vscale
            x += hstart
            y += vstart + fontsize // 2
            childx = [self.coords[c][1] for c in children[node]]
            xmin = hstart + hscale * min(childx)
            xmax = hstart + hscale * max(childx)
            result.append(
                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
                'points="%g,%g %g,%g" />' % (xmin, y, xmax, y)
            )
            result.append(
                '\t<polyline style="stroke:black; stroke-width:1; fill:none;" '
                'points="%g,%g %g,%g" />' % (x, y, x, y - fontsize // 3)
            )

        # vertical branches from children to parents
        for child, parent in self.edges.items():
            y, _ = self.coords[parent]
            y *= vscale
            y += vstart + fontsize // 2
            childy, childx = self.coords[child]
            childx *= hscale
            childy *= vscale
            childx += hstart
            childy += vstart - fontsize
            result += [
                # wide white stroke first, then the black branch on top
                '\t<polyline style="stroke:white; stroke-width:10; fill:none;"'
                ' points="%g,%g %g,%g" />' % (childx, childy, childx, y + 5),
                '\t<polyline style="stroke:black; stroke-width:1; fill:none;"'
                ' points="%g,%g %g,%g" />' % (childx, childy, childx, y),
            ]

        # write nodes with coordinates
        for n, (row, column) in self.coords.items():
            node = self.nodes[n]
            x = column * hscale + hstart
            y = row * vscale + vstart
            if n in self.highlight:
                color = nodecolor if isinstance(node, Tree) else leafcolor
                if isinstance(node, Tree) and node.label().startswith("-"):
                    color = funccolor
            else:
                color = "black"
            result += [
                '\t<text style="text-anchor: middle; fill: %s; '
                'font-size: %dpx;" x="%g" y="%g">%s</text>'
                % (
                    color,
                    fontsize,
                    x,
                    y,
                    escape(
                        node.label() if isinstance(node, Tree) else node, quote=False
                    ),
                )
            ]

        result += ["</svg>"]
        return "\n".join(result)
|
||||
|
||||
|
||||
def test():
    """Render a few sample trees to stdout as a visual smoke test."""

    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        # Header: identifier plus the words of the sentence, then the
        # bracketed tree, then the drawing.
        print()
        words = " ".join(sentence or tree.leaves())
        print(f'{n}: "{words}"')
        print(tree)
        print()
        printer = TreePrettyPrinter(tree, sentence)
        try:
            print(printer.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Fall back to a plain ASCII rendering without colors.
            print(printer.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank

    for n in (0, 1440, 1591, 2771, 2170):
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    # The last treebank tree is drawn once more with the default line style.
    print()
    print("ASCII version:")
    ascii_drawing = TreePrettyPrinter(tree).text(nodedist=2)
    print(ascii_drawing)

    # A tree with integer leaves that index into ``sentence``.
    bracketing = (
        "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
        "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
        "(vg 10) (inf (verb 11)))))) (punct 12))"
    )
    tree = Tree.fromstring(bracketing, read_leaf=int)
    sentence = (
        "Ze had met haar moeder kunnen gaan winkelen ,"
        " zwemmen of terrassen .".split()
    )
    print_tree("Discontinuous tree", tree, sentence, nodedist=2)
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``.
__all__ = ["TreePrettyPrinter"]


# Run the drawing demo when this module is executed as a script.
if __name__ == "__main__":
    test()
|
||||
74
backend/venv/Lib/site-packages/nltk/tree/probabilistic.py
Normal file
74
backend/venv/Lib/site-packages/nltk/tree/probabilistic.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# Natural Language Toolkit: Text Trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# Tom Aarsen <>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.probability import ProbabilisticMixIn
|
||||
from nltk.tree.immutable import ImmutableProbabilisticTree
|
||||
from nltk.tree.tree import Tree
|
||||
|
||||
######################################################################
|
||||
## Probabilistic trees
|
||||
######################################################################
|
||||
|
||||
|
||||
class ProbabilisticTree(Tree, ProbabilisticMixIn):
    """
    A ``Tree`` combined with ``ProbabilisticMixIn``.

    The probability (``self.prob()``) is included in the string forms,
    is carried over by ``copy``/``convert``, and takes part in equality
    and ordering.
    """

    def __init__(self, node, children=None, **prob_kwargs):
        # Both bases are initialised explicitly: label and children go to
        # Tree, the remaining keyword arguments go to the mix-in.
        Tree.__init__(self, node, children)
        ProbabilisticMixIn.__init__(self, **prob_kwargs)

    # The methods below are overridden so they work right for this class:
    def _frozen_class(self):
        return ImmutableProbabilisticTree

    def __repr__(self):
        return "{} (p={!r})".format(Tree.__repr__(self), self.prob())

    def __str__(self):
        return "{} (p={:.6g})".format(self.pformat(margin=60), self.prob())

    def copy(self, deep=False):
        """Return a copy; a deep copy is produced through ``convert``."""
        if deep:
            return type(self).convert(self)
        return type(self)(self._label, self, prob=self.prob())

    @classmethod
    def convert(cls, val):
        """
        Recursively rebuild ``val`` as an instance of ``cls``.

        Values that are not trees are returned unchanged. Trees that are
        not a ``ProbabilisticMixIn`` get probability 1.0.
        """
        if not isinstance(val, Tree):
            return val
        converted = [cls.convert(child) for child in val]
        probability = val.prob() if isinstance(val, ProbabilisticMixIn) else 1.0
        return cls(val._label, converted, prob=probability)

    def __eq__(self, other):
        # Only objects of exactly the same class can be equal.
        if self.__class__ is not other.__class__:
            return False
        mine = (self._label, list(self), self.prob())
        theirs = (other._label, list(other), other.prob())
        return mine == theirs

    def __lt__(self, other):
        if not isinstance(other, Tree):
            raise_unorderable_types("<", self, other)
        if self.__class__ is not other.__class__:
            # Different tree classes are ordered by class name.
            return self.__class__.__name__ < other.__class__.__name__
        mine = (self._label, list(self), self.prob())
        theirs = (other._label, list(other), other.prob())
        return mine < theirs
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``.
__all__ = ["ProbabilisticTree"]
|
||||
337
backend/venv/Lib/site-packages/nltk/tree/transforms.py
Normal file
337
backend/venv/Lib/site-packages/nltk/tree/transforms.py
Normal file
@@ -0,0 +1,337 @@
|
||||
# Natural Language Toolkit: Tree Transformations
|
||||
#
|
||||
# Copyright (C) 2005-2007 Oregon Graduate Institute
|
||||
# Author: Nathan Bodenstab <bodenstab@cslu.ogi.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
r"""
|
||||
A collection of methods for tree (grammar) transformations used
|
||||
in parsing natural language.
|
||||
|
||||
Although many of these methods are technically grammar transformations
|
||||
(i.e. Chomsky Normal Form), when working with treebanks it is much more
|
||||
natural to visualize these modifications in a tree structure. Hence,
|
||||
we will do all transformation directly to the tree itself.
|
||||
Transforming the tree directly also allows us to do parent annotation.
|
||||
A grammar can then be simply induced from the modified tree.
|
||||
|
||||
The following is a short tutorial on the available transformations.
|
||||
|
||||
1. Chomsky Normal Form (binarization)
|
||||
|
||||
It is well known that any grammar has a Chomsky Normal Form (CNF)
|
||||
equivalent grammar where CNF is defined by every production having
|
||||
either two non-terminals or one terminal on its right hand side.
|
||||
When we have hierarchically structured data (ie. a treebank), it is
|
||||
natural to view this in terms of productions where the root of every
|
||||
subtree is the head (left hand side) of the production and all of
|
||||
its children are the right hand side constituents. In order to
|
||||
convert a tree into CNF, we simply need to ensure that every subtree
|
||||
has either two subtrees as children (binarization), or one leaf node
|
||||
(non-terminal). In order to binarize a subtree with more than two
|
||||
children, we must introduce artificial nodes.
|
||||
|
||||
There are two popular methods to convert a tree into CNF: left
|
||||
factoring and right factoring. The following example demonstrates
|
||||
the difference between them. Example::
|
||||
|
||||
Original Right-Factored Left-Factored
|
||||
|
||||
A A A
|
||||
/ | \ / \ / \
|
||||
B C D ==> B A|<C-D> OR A|<B-C> D
|
||||
/ \ / \
|
||||
C D B C
|
||||
|
||||
2. Parent Annotation
|
||||
|
||||
In addition to binarizing the tree, there are two standard
|
||||
modifications to node labels we can do in the same traversal: parent
|
||||
annotation and Markov order-N smoothing (or sibling smoothing).
|
||||
|
||||
The purpose of parent annotation is to refine the probabilities of
|
||||
productions by adding a small amount of context. With this simple
|
||||
addition, a CYK (inside-outside, dynamic programming chart parse)
|
||||
can improve from 74% to 79% accuracy. A natural generalization from
|
||||
parent annotation is to grandparent annotation and beyond. The
|
||||
tradeoff becomes accuracy gain vs. computational complexity. We
|
||||
must also keep in mind data sparsity issues. Example::
|
||||
|
||||
Original Parent Annotation
|
||||
|
||||
A A^<?>
|
||||
/ | \ / \
|
||||
B C D ==> B^<A> A|<C-D>^<?> where ? is the
|
||||
/ \ parent of A
|
||||
C^<A> D^<A>
|
||||
|
||||
|
||||
3. Markov order-N smoothing
|
||||
|
||||
Markov smoothing combats data sparsity issues as well as decreasing
|
||||
computational requirements by limiting the number of children
|
||||
included in artificial nodes. In practice, most people use an order
|
||||
2 grammar. Example::
|
||||
|
||||
Original No Smoothing Markov order 1 Markov order 2 etc.
|
||||
|
||||
__A__ A A A
|
||||
/ /|\ \ / \ / \ / \
|
||||
B C D E F ==> B A|<C-D-E-F> ==> B A|<C> ==> B A|<C-D>
|
||||
/ \ / \ / \
|
||||
C ... C ... C ...
|
||||
|
||||
|
||||
|
||||
Annotation decisions can be thought about in the vertical direction
|
||||
(parent, grandparent, etc) and the horizontal direction (number of
|
||||
siblings to keep). Parameters to the following functions specify
|
||||
these values. For more information see:
|
||||
|
||||
Dan Klein and Chris Manning (2003) "Accurate Unlexicalized
|
||||
Parsing", ACL-03. https://www.aclweb.org/anthology/P03-1054
|
||||
|
||||
4. Unary Collapsing
|
||||
|
||||
Collapse unary productions (ie. subtrees with a single child) into a
|
||||
new non-terminal (Tree node). This is useful when working with
|
||||
algorithms that do not allow unary productions, yet you do not wish
|
||||
to lose the parent information. Example::
|
||||
|
||||
A
|
||||
|
|
||||
B ==> A+B
|
||||
/ \ / \
|
||||
C D C D
|
||||
|
||||
"""
|
||||
|
||||
from nltk.tree.tree import Tree
|
||||
|
||||
|
||||
def chomsky_normal_form(
    tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"
):
    """
    Binarize ``tree`` in place, optionally annotating labels with ancestors.

    Every subtree with more than two children is rewritten into a chain of
    binary subtrees. The artificial nodes introduced for this are labelled
    ``<label><childChar><<sibling labels joined by "-">>`` followed by the
    parent annotation, if any.

    :param tree: the Tree to transform; it is modified in place.
    :param factor: ``"right"`` for right factoring; any other value selects
        left factoring.
    :param horzMarkov: maximum number of sibling labels recorded in the
        label of an artificial node; ``None`` records all of them.
    :param vertMarkov: number of ancestor labels appended to the labels of
        non-root subtrees whose first child is a Tree; 0 disables parent
        annotation.
    :param childChar: separator between the original label and the sibling
        list in artificial node labels.
    :param parentChar: separator that introduces the parent annotation.
    :return: None.
    """
    # assume all subtrees have homogeneous children
    # assume all terminals have no siblings

    # Traverse the tree depth-first keeping a list of ancestor nodes to the root.
    # I chose not to use the tree.treepositions() method since it requires
    # two traversals of the tree (one to get the positions, one to iterate
    # over them) and node access time is proportional to the height of the node.
    # This method is 7x faster which helps when parsing 40,000 sentences.

    nodeList = [(tree, [tree.label()])]
    while nodeList:
        node, parent = nodeList.pop()
        if not isinstance(node, Tree):
            continue

        # parent annotation
        parentString = ""
        originalNode = node.label()
        # ``is not`` instead of ``!=``: only the root object itself must be
        # excluded, and identity avoids a deep comparison per node.
        # ``len(node) > 0`` guards ``node[0]`` for subtrees without children.
        if (
            vertMarkov != 0
            and node is not tree
            and len(node) > 0
            and isinstance(node[0], Tree)
        ):
            parentString = "{}<{}>".format(parentChar, "-".join(parent))
            node.set_label(node.label() + parentString)
            parent = [originalNode] + parent[: vertMarkov - 1]

        # add children to the agenda before we mess with them
        for child in node:
            nodeList.append((child, parent))

        # chomsky normal form factorization
        if len(node) > 2:
            childNodes = [child.label() for child in node]
            nodeCopy = node.copy()
            node[0:] = []  # delete the children

            curNode = node
            numChildren = len(nodeCopy)
            # With no horizontal limit every sibling is recorded; using the
            # actual child count avoids truncation for very wide nodes.
            window = numChildren if horzMarkov is None else horzMarkov
            for i in range(1, numChildren - 1):
                if factor == "right":
                    newHead = "{}{}<{}>{}".format(
                        originalNode,
                        childChar,
                        "-".join(childNodes[i : min([i + window, numChildren])]),
                        parentString,
                    )  # create new head
                    newNode = Tree(newHead, [])
                    curNode[0:] = [nodeCopy.pop(0), newNode]
                else:
                    newHead = "{}{}<{}>{}".format(
                        originalNode,
                        childChar,
                        "-".join(childNodes[max([numChildren - i - window, 0]) : -i]),
                        parentString,
                    )
                    newNode = Tree(newHead, [])
                    curNode[0:] = [newNode, nodeCopy.pop()]

                curNode = newNode

            # the two children that are left go under the last artificial node
            curNode[0:] = [child for child in nodeCopy]
|
||||
|
||||
|
||||
def un_chomsky_normal_form(
    tree, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"
):
    """
    Undo, in place, the modifications made by ``chomsky_normal_form``.

    The tree is traversed depth-first and modified directly; no value is
    returned.  Three things happen during the traversal:

    1. Every artificial node (a node whose label contains ``childChar``)
       is removed and its children are spliced into its parent at the
       position the artificial node occupied.
    2. Parent annotation (everything from ``parentChar`` onwards) is
       stripped from the labels of the remaining nodes.
    3. If ``expandUnary`` is true, a label joined by ``unaryChar`` is split
       at the first ``unaryChar`` into a parent and a new unary child.

    :param tree: The Tree to be restored
    :type tree: Tree
    :param expandUnary: Flag to expand collapsed unary productions or not
        (default = True)
    :type expandUnary: bool
    :param childChar: A string marking artificial nodes (default = "|")
    :type childChar: str
    :param parentChar: A string separating a label from its parent
        annotation (default = "^")
    :type parentChar: str
    :param unaryChar: A string joining two labels of a collapsed unary
        production (default = "+")
    :type unaryChar: str
    """
    # Each agenda entry pairs a node with the node that currently holds it,
    # so that artificial nodes can be removed from their parent.
    nodeList = [(tree, [])]
    while nodeList:
        node, parent = nodeList.pop()
        if not isinstance(node, Tree):
            continue

        if childChar in node.label():
            # Artificial node: replace it by its own children, in place.
            # Splicing at the node's index keeps the left-to-right order of
            # the children whatever the factoring direction was, and does
            # not assume the artificial node has exactly two children.
            nodeIndex = parent.index(node)
            parent[nodeIndex : nodeIndex + 1] = list(node)

            # parent is now the current node so the children of parent
            # will be added to the agenda
            node = parent
        else:
            parentIndex = node.label().find(parentChar)
            if parentIndex != -1:
                # strip the node name of the parent annotation
                node.set_label(node.label()[:parentIndex])

            # expand collapsed unary productions
            if expandUnary:
                # partition() skips the whole separator, so a join string
                # longer than one character is handled correctly.
                head, joiner, rest = node.label().partition(unaryChar)
                if joiner:
                    newNode = Tree(rest, list(node))
                    node.set_label(head)
                    node[0:] = [newNode]

        for child in node:
            nodeList.append((child, node))
|
||||
|
||||
|
||||
def collapse_unary(tree, collapsePOS=False, collapseRoot=False, joinChar="+"):
    """
    Collapse subtrees with a single child (ie. unary productions)
    into a new non-terminal (Tree node) joined by 'joinChar'.
    This is useful when working with algorithms that do not allow
    unary productions, and completely removing the unary productions
    would require loss of useful information.  The Tree is modified
    directly (since it is passed by reference) and no value is returned.

    :param tree: The Tree to be collapsed
    :type tree: Tree
    :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
        Part-of-Speech tags) since they are always unary productions
    :type collapsePOS: bool
    :param collapseRoot: 'False' (default) will not modify the root production
        if it is unary.  For the Penn WSJ treebank corpus, this corresponds
        to the TOP -> productions.
    :type collapseRoot: bool
    :param joinChar: A string used to connect collapsed node values (default = "+")
    :type joinChar: str
    """
    # A unary root is left alone unless the caller asked for it to be
    # collapsed: the traversal then starts at its only child.
    if not collapseRoot and isinstance(tree, Tree) and len(tree) == 1:
        nodeList = [tree[0]]
    else:
        nodeList = [tree]

    # depth-first traversal of tree
    while nodeList:
        node = nodeList.pop()
        if not isinstance(node, Tree):
            continue

        if len(node) == 1 and isinstance(node[0], Tree):
            onlyChild = node[0]
            # The length check keeps a childless subtree from being indexed,
            # which would otherwise raise IndexError.
            collapsible = collapsePOS or (
                len(onlyChild) > 0 and isinstance(onlyChild[0], Tree)
            )
        else:
            collapsible = False

        if collapsible:
            node.set_label(node.label() + joinChar + onlyChild.label())
            node[0:] = list(onlyChild)
            # since we assigned the child's children to the current node,
            # evaluate the current node again
            nodeList.append(node)
        else:
            nodeList.extend(node)
|
||||
|
||||
|
||||
#################################################################
|
||||
# Demonstration
|
||||
#################################################################
|
||||
|
||||
|
||||
def demo():
    """
    A demonstration showing how each tree transform can be used.

    Parses a bracketed sentence, applies ``collapse_unary``,
    ``chomsky_normal_form`` (with and without annotation) and
    ``un_chomsky_normal_form`` to copies of it, prints the original and
    the restored bracketing, and finally draws all five trees.
    """

    from copy import deepcopy

    from nltk.draw.tree import draw_trees
    from nltk.tree.tree import Tree

    # original tree from WSJ bracketed text
    sentence = """(TOP
(S
(S
(VP
(VBN Turned)
(ADVP (RB loose))
(PP
(IN in)
(NP
(NP (NNP Shane) (NNP Longman) (POS 's))
(NN trading)
(NN room)))))
(, ,)
(NP (DT the) (NN yuppie) (NNS dealers))
(VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
(. .)))"""
    t = Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results comparable)
    original = deepcopy(parentTree)
    un_chomsky_normal_form(original)

    # convert tree back to bracketed text
    # pformat() returns the bracketed string; pprint() only prints it and
    # returns None, which made the comparison below meaningless.
    sentence2 = original.pformat()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
|
||||
|
||||
|
||||
# Public names of this module.
__all__ = [
    "chomsky_normal_form",
    "un_chomsky_normal_form",
    "collapse_unary",
]

# Run the demonstration when the module is executed as a script.
if __name__ == "__main__":
    demo()
|
||||
982
backend/venv/Lib/site-packages/nltk/tree/tree.py
Normal file
982
backend/venv/Lib/site-packages/nltk/tree/tree.py
Normal file
@@ -0,0 +1,982 @@
|
||||
# Natural Language Toolkit: Text Trees
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@gu.se>
|
||||
# Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms)
|
||||
# Eric Kafe <kafe.eric@gmail.com> (Tree.fromlist())
|
||||
# Mohaned mashaly<mohaned.mashaly12@gmail.com> (Deprecating methods)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Class for representing hierarchical language structures, such as
|
||||
syntax trees and morphological trees.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.grammar import Nonterminal, Production
|
||||
from nltk.internals import deprecated
|
||||
|
||||
######################################################################
|
||||
## Trees
|
||||
######################################################################
|
||||
|
||||
|
||||
class Tree(list):
|
||||
r"""
|
||||
A Tree represents a hierarchical grouping of leaves and subtrees.
|
||||
For example, each constituent in a syntax tree is represented by a single Tree.
|
||||
|
||||
A tree's children are encoded as a list of leaves and subtrees,
|
||||
where a leaf is a basic (non-tree) value; and a subtree is a
|
||||
nested Tree.
|
||||
|
||||
>>> from nltk.tree import Tree
|
||||
>>> print(Tree(1, [2, Tree(3, [4]), 5]))
|
||||
(1 2 (3 4) 5)
|
||||
>>> vp = Tree('VP', [Tree('V', ['saw']),
|
||||
... Tree('NP', ['him'])])
|
||||
>>> s = Tree('S', [Tree('NP', ['I']), vp])
|
||||
>>> print(s)
|
||||
(S (NP I) (VP (V saw) (NP him)))
|
||||
>>> print(s[1])
|
||||
(VP (V saw) (NP him))
|
||||
>>> print(s[1,1])
|
||||
(NP him)
|
||||
>>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
|
||||
>>> s == t
|
||||
True
|
||||
>>> t[1][1].set_label('X')
|
||||
>>> t[1][1].label()
|
||||
'X'
|
||||
>>> print(t)
|
||||
(S (NP I) (VP (V saw) (X him)))
|
||||
>>> t[0], t[1,1] = t[1,1], t[0]
|
||||
>>> print(t)
|
||||
(S (X him) (VP (V saw) (NP I)))
|
||||
|
||||
The length of a tree is the number of children it has.
|
||||
|
||||
>>> len(t)
|
||||
2
|
||||
|
||||
The set_label() and label() methods allow individual constituents
|
||||
to be labeled. For example, syntax trees use this label to specify
|
||||
phrase tags, such as "NP" and "VP".
|
||||
|
||||
Several Tree methods use "tree positions" to specify
|
||||
children or descendants of a tree. Tree positions are defined as
|
||||
follows:
|
||||
|
||||
- The tree position *i* specifies a Tree's *i*\ th child.
|
||||
- The tree position ``()`` specifies the Tree itself.
|
||||
- If *p* is the tree position of descendant *d*, then
|
||||
*p+i* specifies the *i*\ th child of *d*.
|
||||
|
||||
I.e., every tree position is either a single index *i*,
|
||||
specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*,
|
||||
specifying ``tree[i1][i2]...[iN]``.
|
||||
|
||||
Construct a new tree. This constructor can be called in one
|
||||
of two ways:
|
||||
|
||||
- ``Tree(label, children)`` constructs a new tree with the
|
||||
specified label and list of children.
|
||||
|
||||
- ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``.
|
||||
"""
|
||||
|
||||
def __init__(self, node, children=None):
|
||||
if children is None:
|
||||
raise TypeError(
|
||||
"%s: Expected a node value and child list " % type(self).__name__
|
||||
)
|
||||
elif isinstance(children, str):
|
||||
raise TypeError(
|
||||
"%s() argument 2 should be a list, not a "
|
||||
"string" % type(self).__name__
|
||||
)
|
||||
else:
|
||||
list.__init__(self, children)
|
||||
self._label = node
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Comparison operators
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.__class__ is other.__class__ and (self._label, list(self)) == (
|
||||
other._label,
|
||||
list(other),
|
||||
)
|
||||
|
||||
def __lt__(self, other):
    """
    Order trees of the same class by ``(label, children)``; in every other
    case order by class name.

    The class-name fallback also covers non-tree operands, because
    children can be plain strings and must still be comparable with
    subtrees.
    """
    if isinstance(other, Tree) and self.__class__ is other.__class__:
        return (self._label, list(self)) < (other._label, list(other))
    return self.__class__.__name__ < other.__class__.__name__
|
||||
|
||||
# @total_ordering doesn't work here, since the class inherits from a builtin class
|
||||
__ne__ = lambda self, other: not self == other
|
||||
__gt__ = lambda self, other: not (self < other or self == other)
|
||||
__le__ = lambda self, other: self < other or self == other
|
||||
__ge__ = lambda self, other: not self < other
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Disabled list operations
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
def __mul__(self, v):
|
||||
raise TypeError("Tree does not support multiplication")
|
||||
|
||||
def __rmul__(self, v):
|
||||
raise TypeError("Tree does not support multiplication")
|
||||
|
||||
def __add__(self, v):
|
||||
raise TypeError("Tree does not support addition")
|
||||
|
||||
def __radd__(self, v):
|
||||
raise TypeError("Tree does not support addition")
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Indexing (with support for tree positions)
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
def __getitem__(self, index):
|
||||
if isinstance(index, (int, slice)):
|
||||
return list.__getitem__(self, index)
|
||||
elif isinstance(index, (list, tuple)):
|
||||
if len(index) == 0:
|
||||
return self
|
||||
elif len(index) == 1:
|
||||
return self[index[0]]
|
||||
else:
|
||||
return self[index[0]][index[1:]]
|
||||
else:
|
||||
raise TypeError(
|
||||
"%s indices must be integers, not %s"
|
||||
% (type(self).__name__, type(index).__name__)
|
||||
)
|
||||
|
||||
def __setitem__(self, index, value):
|
||||
if isinstance(index, (int, slice)):
|
||||
return list.__setitem__(self, index, value)
|
||||
elif isinstance(index, (list, tuple)):
|
||||
if len(index) == 0:
|
||||
raise IndexError("The tree position () may not be " "assigned to.")
|
||||
elif len(index) == 1:
|
||||
self[index[0]] = value
|
||||
else:
|
||||
self[index[0]][index[1:]] = value
|
||||
else:
|
||||
raise TypeError(
|
||||
"%s indices must be integers, not %s"
|
||||
% (type(self).__name__, type(index).__name__)
|
||||
)
|
||||
|
||||
def __delitem__(self, index):
|
||||
if isinstance(index, (int, slice)):
|
||||
return list.__delitem__(self, index)
|
||||
elif isinstance(index, (list, tuple)):
|
||||
if len(index) == 0:
|
||||
raise IndexError("The tree position () may not be deleted.")
|
||||
elif len(index) == 1:
|
||||
del self[index[0]]
|
||||
else:
|
||||
del self[index[0]][index[1:]]
|
||||
else:
|
||||
raise TypeError(
|
||||
"%s indices must be integers, not %s"
|
||||
% (type(self).__name__, type(index).__name__)
|
||||
)
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Basic tree operations
|
||||
# ////////////////////////////////////////////////////////////
|
||||
@deprecated("Use label() instead")
def _get_node(self):
    """Outdated way of reading the node value; use the label() method instead."""

@deprecated("Use set_label() instead")
def _set_node(self, value):
    """Outdated way of setting the node value; use the set_label() method instead."""

# Legacy ``node`` attribute, routed through the two deprecated accessors.
node = property(fget=_get_node, fset=_set_node)
|
||||
|
||||
def label(self):
    """
    Return the label stored on this node.

    >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))')
    >>> t.label()
    'S'

    :return: the node label (typically a string)
    :rtype: any
    """
    node_label = self._label
    return node_label
|
||||
|
||||
def set_label(self, label):
    """
    Replace the label stored on this node.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.set_label("T")
    >>> print(t)
    (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))

    :param label: the node label (typically a string)
    :type label: any
    """
    setattr(self, "_label", label)
|
||||
|
||||
def leaves(self):
    """
    Collect the leaves of the tree, left to right.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.leaves()
    ['the', 'dog', 'chased', 'the', 'cat']

    :return: a list containing this tree's leaves.
        The order reflects the order of the
        leaves in the tree's hierarchical structure.
    :rtype: list
    """
    collected = []
    for child in self:
        # A subtree contributes all of its own leaves; anything else is a leaf.
        collected += child.leaves() if isinstance(child, Tree) else [child]
    return collected
|
||||
|
||||
def flatten(self):
    """
    Build a two-level tree: this tree's label directly above its leaves.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> print(t.flatten())
    (S the dog chased the cat)

    :return: a tree consisting of this tree's root connected directly to
        its leaves, omitting all intervening non-terminal nodes.
    :rtype: Tree
    """
    root_label = self.label()
    words = self.leaves()
    return Tree(root_label, words)
|
||||
|
||||
def height(self):
    """
    Compute the height of the tree.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.height()
    5
    >>> print(t[0,0])
    (D the)
    >>> t[0,0].height()
    2

    :return: The height of this tree.  The height of a tree
        containing no children is 1; the height of a tree
        containing only leaves is 2; and the height of any other
        tree is one plus the maximum of its children's
        heights.
    :rtype: int
    """
    # A leaf counts as height 1; a subtree reports its own height.
    child_heights = [
        child.height() if isinstance(child, Tree) else 1 for child in self
    ]
    return 1 + max(child_heights, default=0)
|
||||
|
||||
def treepositions(self, order="preorder"):
    """
    List the tree positions of this tree in the requested order.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.treepositions() # doctest: +ELLIPSIS
    [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...]
    >>> for pos in t.treepositions('leaves'):
    ...     t[pos] = t[pos][::-1].upper()
    >>> print(t)
    (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC))))

    :param order: One of: ``preorder``, ``postorder``, ``bothorder``,
        ``leaves``.
    """
    before_children = order in ("preorder", "bothorder")
    after_children = order in ("postorder", "bothorder")

    # The tree's own position () is emitted before and/or after its children.
    positions = [()] if before_children else []
    for i, child in enumerate(self):
        if isinstance(child, Tree):
            # Prefix each position inside the child with the child's index.
            positions += [(i,) + p for p in child.treepositions(order)]
        else:
            positions.append((i,))
    if after_children:
        positions.append(())
    return positions
|
||||
|
||||
def subtrees(self, filter=None):
    """
    Yield this tree and then, recursively, every subtree below it,
    optionally keeping only the trees accepted by ``filter``.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> for s in t.subtrees(lambda t: t.height() == 2):
    ...     print(s)
    (D the)
    (N dog)
    (V chased)
    (D the)
    (N cat)

    :type filter: function
    :param filter: the function to filter all local trees
    """
    if not filter or filter(self):
        yield self
    # Leaves are skipped; only nested trees are descended into.
    nested = (child for child in self if isinstance(child, Tree))
    for subtree in nested:
        yield from subtree.subtrees(filter)
|
||||
|
||||
def productions(self):
    """
    List the productions that correspond to the non-terminal nodes of the tree.
    For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
    form P -> C1 C2 ... Cn.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions() # doctest: +NORMALIZE_WHITESPACE
    [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased',
    NP -> D N, D -> 'the', N -> 'cat']

    :rtype: list(Production)
    :raise TypeError: if this node's label is not a string
    """
    if not isinstance(self._label, str):
        raise TypeError(
            "Productions can only be generated from trees having node labels that are strings"
        )

    # This node's own production comes first, followed by those of its subtrees.
    rules = [Production(Nonterminal(self._label), _child_names(self))]
    for subtree in (child for child in self if isinstance(child, Tree)):
        rules.extend(subtree.productions())
    return rules
|
||||
|
||||
def pos(self):
    """
    Pair every leaf with the label of the node directly above it.

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.pos()
    [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]

    :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
        The order reflects the order of the leaves in the tree's hierarchical structure.
    :rtype: list(tuple)
    """
    tagged = []
    for child in self:
        if isinstance(child, Tree):
            tagged += child.pos()
        else:
            # A direct leaf is tagged with this node's label.
            tagged.append((child, self._label))
    return tagged
|
||||
|
||||
def leaf_treeposition(self, index):
    """
    :return: The tree position of the ``index``-th leaf in this
        tree.  I.e., if ``tp=self.leaf_treeposition(i)``, then
        ``self[tp]==self.leaves()[i]``.

    :raise IndexError: If this tree contains fewer than ``index+1``
        leaves, or if ``index<0``.
    """
    if index < 0:
        raise IndexError("index must be non-negative")

    remaining = index
    agenda = [(self, ())]
    while agenda:
        item, position = agenda.pop()
        if isinstance(item, Tree):
            # Push the children right-to-left so the leftmost is popped first.
            agenda.extend(
                (item[i], position + (i,)) for i in reversed(range(len(item)))
            )
            continue
        if remaining == 0:
            return position
        remaining -= 1

    raise IndexError("index must be less than or equal to len(self)")
|
||||
|
||||
def treeposition_spanning_leaves(self, start, end):
    """
    :return: The tree position of the lowest descendant of this
        tree that dominates ``self.leaves()[start:end]``.
    :raise ValueError: if ``end <= start``
    """
    if end <= start:
        raise ValueError("end must be greater than start")

    # The answer is the longest common prefix of the positions of the
    # first and the last leaf of the span.
    first = self.leaf_treeposition(start)
    last = self.leaf_treeposition(end - 1)
    shared = 0
    for step_first, step_last in zip(first, last):
        if step_first != step_last:
            break
        shared += 1
    return first[:shared]
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Transforms
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
def chomsky_normal_form(
    self,
    factor="right",
    horzMarkov=None,
    vertMarkov=0,
    childChar="|",
    parentChar="^",
):
    """
    Modify this tree in place by delegating to
    ``nltk.tree.transforms.chomsky_normal_form``.  The tree can be changed
    in three ways:

    1. Convert a tree into its Chomsky Normal Form (CNF)
       equivalent -- Every subtree has either two non-terminals
       or one terminal as its children.  This process requires
       the creation of more "artificial" non-terminal nodes.
    2. Markov (horizontal) smoothing of children in new artificial
       nodes
    3. Vertical (parent) annotation of nodes

    :param factor: Right or left factoring method (default = "right")
    :type  factor: str = [left|right]
    :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings)
    :type  horzMarkov: int | None
    :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation)
    :type  vertMarkov: int | None
    :param childChar: A string used in construction of the artificial nodes, separating the head of the
                      original subtree from the child nodes that have yet to be expanded (default = "|")
    :type  childChar: str
    :param parentChar: A string used to separate the node representation from its vertical annotation
    :type  parentChar: str
    """
    # Imported here, under an alias, so that the module-level function is
    # not confused with this method of the same name.
    from nltk.tree.transforms import chomsky_normal_form as _transform

    _transform(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
|
||||
|
||||
def un_chomsky_normal_form(
    self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"
):
    """
    Modify this tree in place by delegating to
    ``nltk.tree.transforms.un_chomsky_normal_form``.  The tree is changed
    in three ways:

    1. Transforms a tree in Chomsky Normal Form back to its
       original structure (branching greater than two)
    2. Removes any parent annotation (if it exists)
    3. (optional) expands unary subtrees (if previously
       collapsed with collapse_unary(...) )

    :param expandUnary: Flag to expand unary or not (default = True)
    :type  expandUnary: bool
    :param childChar: A string separating the head node from its children in an artificial node (default = "|")
    :type  childChar: str
    :param parentChar: A string separating the node label from its parent annotation (default = "^")
    :type  parentChar: str
    :param unaryChar: A string joining two non-terminals in a unary production (default = "+")
    :type  unaryChar: str
    """
    # Imported here, under an alias, so that the module-level function is
    # not confused with this method of the same name.
    from nltk.tree.transforms import un_chomsky_normal_form as _transform

    _transform(self, expandUnary, childChar, parentChar, unaryChar)
|
||||
|
||||
def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"):
    """
    Collapse subtrees with a single child (ie. unary productions)
    into a new non-terminal (Tree node) joined by 'joinChar', by
    delegating to ``nltk.tree.transforms.collapse_unary``.
    This is useful when working with algorithms that do not allow
    unary productions, and completely removing the unary productions
    would require loss of useful information.  The Tree is modified
    directly (since it is passed by reference) and no value is returned.

    :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
                        Part-of-Speech tags) since they are always unary productions
    :type  collapsePOS: bool
    :param collapseRoot: 'False' (default) will not modify the root production
                         if it is unary.  For the Penn WSJ treebank corpus, this corresponds
                         to the TOP -> productions.
    :type collapseRoot: bool
    :param joinChar: A string used to connect collapsed node values (default = "+")
    :type  joinChar: str
    """
    # Imported here, under an alias, so that the module-level function is
    # not confused with this method of the same name.
    from nltk.tree.transforms import collapse_unary as _transform

    _transform(self, collapsePOS, collapseRoot, joinChar)
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Convert, copy
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
@classmethod
def convert(cls, tree):
    """
    Convert a tree between different subtypes of Tree.  ``cls`` determines
    which class will be used to encode the new tree.

    :type tree: Tree
    :param tree: The tree that should be converted.
    :return: The new Tree.  A value that is not a Tree is returned as is.
    """
    if not isinstance(tree, Tree):
        return tree
    # Convert bottom-up: the children first, then the node that holds them.
    converted_children = [cls.convert(child) for child in tree]
    return cls(tree._label, converted_children)
|
||||
|
||||
def __copy__(self):
|
||||
return self.copy()
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
return self.copy(deep=True)
|
||||
|
||||
def copy(self, deep=False):
    """
    Return a copy of this tree, of the same class as this tree.

    :param deep: if false (default), build a new node around the same
        children; if true, rebuild the whole tree with ``convert``.
    """
    tree_class = type(self)
    if deep:
        return tree_class.convert(self)
    return tree_class(self._label, self)
|
||||
|
||||
def _frozen_class(self):
    """Return the class that ``freeze`` converts this tree into."""
    # Imported here rather than at module level.
    from nltk.tree.immutable import ImmutableTree as frozen

    return frozen
|
||||
|
||||
def freeze(self, leaf_freezer=None):
    """
    Return a copy of this tree converted to the class given by
    ``self._frozen_class()``.

    :param leaf_freezer: if given, a function applied to every leaf of a
        deep copy of this tree before that copy is converted.
    """
    frozen_class = self._frozen_class()
    source = self
    if leaf_freezer is not None:
        # Work on a deep copy so that this tree's leaves stay untouched.
        source = self.copy(deep=True)
        for leaf_pos in source.treepositions("leaves"):
            source[leaf_pos] = leaf_freezer(source[leaf_pos])
    newcopy = frozen_class.convert(source)
    hash(newcopy)  # Make sure the leaves are hashable.
    return newcopy
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Parsing
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
@classmethod
def fromstring(
    cls,
    s,
    brackets="()",
    read_node=None,
    read_leaf=None,
    node_pattern=None,
    leaf_pattern=None,
    remove_empty_top_bracketing=False,
):
    """
    Read a bracketed tree string and return the resulting tree.
    Trees are represented as nested brackettings, such as::

      (S (NP (NNP John)) (VP (V runs)))

    :type s: str
    :param s: The string to read

    :type brackets: str (length=2)
    :param brackets: The bracket characters used to mark the
        beginning and end of trees and subtrees.

    :type read_node: function
    :type read_leaf: function
    :param read_node, read_leaf: If specified, these functions
        are applied to the substrings of ``s`` corresponding to
        nodes and leaves (respectively) to obtain the values for
        those nodes and leaves.  They should have the following
        signature:

           read_node(str) -> value

        For example, these functions could be used to process nodes
        and leaves whose values should be some type other than
        string (such as ``FeatStruct``).
        Note that by default, node strings and leaf strings are
        delimited by whitespace and brackets; to override this
        default, use the ``node_pattern`` and ``leaf_pattern``
        arguments.

    :type node_pattern: str
    :type leaf_pattern: str
    :param node_pattern, leaf_pattern: Regular expression patterns
        used to find node and leaf substrings in ``s``.  By
        default, both nodes patterns are defined to match any
        sequence of non-whitespace non-bracket characters.

    :type remove_empty_top_bracketing: bool
    :param remove_empty_top_bracketing: If the resulting tree has
        an empty node label, and is length one, then return its
        single child instead.  This is useful for treebank trees,
        which sometimes contain an extra level of bracketing.

    :return: A tree corresponding to the string representation ``s``.
        If this class method is called using a subclass of Tree,
        then it will return a tree of that type.
    :rtype: Tree
    :raise TypeError: if ``brackets`` is not a string of two
        non-whitespace characters
    """
    if not isinstance(brackets, str) or len(brackets) != 2:
        raise TypeError("brackets must be a length-2 string")
    if re.search(r"\s", brackets):
        raise TypeError("whitespace brackets not allowed")

    # Construct a regexp that will tokenize the string.
    open_b, close_b = brackets
    open_pattern = re.escape(open_b)
    close_pattern = re.escape(close_b)
    default_pattern = rf"[^\s{open_pattern}{close_pattern}]+"
    if node_pattern is None:
        node_pattern = default_pattern
    if leaf_pattern is None:
        leaf_pattern = default_pattern
    token_re = re.compile(
        rf"{open_pattern}\s*({node_pattern})?|{close_pattern}|({leaf_pattern})"
    )

    # Walk through each token, updating a stack of trees.  The bottom
    # entry is a sentinel whose child list receives the finished tree.
    stack = [(None, [])]  # list of (node, children) tuples
    for match in token_re.finditer(s):
        token = match.group()
        at_top_level = len(stack) == 1

        if token[0] == open_b:
            # Beginning of a tree/subtree
            if at_top_level and len(stack[0][1]) > 0:
                # A complete tree has already been read.
                cls._parse_error(s, match, "end-of-string")
            label = token[1:].lstrip()
            if read_node is not None:
                label = read_node(label)
            stack.append((label, []))
        elif token == close_b:
            # End of a tree/subtree
            if at_top_level:
                if len(stack[0][1]) == 0:
                    cls._parse_error(s, match, open_b)
                else:
                    cls._parse_error(s, match, "end-of-string")
            label, children = stack.pop()
            stack[-1][1].append(cls(label, children))
        else:
            # Leaf node
            if at_top_level:
                cls._parse_error(s, match, open_b)
            if read_leaf is not None:
                token = read_leaf(token)
            stack[-1][1].append(token)

    # check that we got exactly one complete tree.
    if len(stack) > 1:
        cls._parse_error(s, "end-of-string", close_b)
    elif len(stack[0][1]) == 0:
        cls._parse_error(s, "end-of-string", open_b)
    else:
        assert stack[0][0] is None
        assert len(stack[0][1]) == 1
    tree = stack[0][1][0]

    # If the tree has an extra level with node='', then get rid of
    # it.  E.g.: "((S (NP ...) (VP ...)))"
    if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1:
        tree = tree[0]
    # return the tree.
    return tree
|
||||
|
||||
@classmethod
def _parse_error(cls, s, match, expecting):
    """
    Raise a ValueError with a friendly message when parsing a tree string fails.

    :param s: The string we're parsing.
    :param match: regexp match of the problem token, or the string
        ``"end-of-string"``.
    :param expecting: what we expected to see instead.
    :raise ValueError: always
    """
    # Construct a basic error message
    if match == "end-of-string":
        pos, token = len(s), "end-of-string"
    else:
        pos, token = match.start(), match.group()
    headline = "%s.read(): expected %r but got %r" % (
        cls.__name__,
        expecting,
        token,
    )
    location = "%sat index %d." % (" " * 12, pos)

    # Add a display showing the error token itself: an excerpt of at most
    # ten characters on either side, with a caret underneath.
    excerpt = s.replace("\n", " ").replace("\t", " ")
    offset = pos
    if len(excerpt) > pos + 10:
        excerpt = excerpt[: pos + 10] + "..."
    if pos > 10:
        excerpt = "..." + excerpt[pos - 10 :]
        offset = 13
    quoted_excerpt = '%s"%s"' % (" " * 16, excerpt)
    caret = " " * (17 + offset) + "^"

    raise ValueError("\n".join([headline, location, quoted_excerpt, caret]))
|
||||
|
||||
@classmethod
def fromlist(cls, l):
    """
    Convert nested lists to a NLTK Tree.

    Each non-empty list is read as ``[label, child, child, ...]``; the
    node label is stored as ``repr(label)`` and every child is converted
    recursively.  A one-element list ``[x]`` is a leaf and becomes the
    string ``repr(x)``.

    :type l: list
    :param l: a tree represented as nested lists

    :return: A tree of type ``cls`` corresponding to the list
        representation ``l``; the string ``repr(l[0])`` when ``l`` has a
        single element; ``None`` when ``l`` is empty or is not a plain
        ``list``.
    :rtype: Tree or str or None
    """
    # Only plain lists are unpacked (exact type check, unchanged from the
    # previous behaviour); any other value, and the empty list, yields None.
    if type(l) is not list or not l:
        return None

    label = repr(l[0])
    if len(l) == 1:
        return label

    # Build with ``cls`` rather than a hard-coded ``Tree`` so that calling
    # this constructor on a subclass produces instances of that subclass.
    return cls(label, [cls.fromlist(child) for child in l[1:]])
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Visualization & String Representation
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
def draw(self):
    """
    Display this tree as a graphical diagram by handing it to
    ``nltk.draw.tree.draw_trees``.
    """
    # Imported lazily, at call time, exactly as the original did.
    from nltk.draw.tree import draw_trees as show_trees

    show_trees(self)
|
||||
|
||||
def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs):
    """
    Render this tree with ``TreePrettyPrinter`` and print the result.

    ``sentence`` and ``highlight`` are passed to the ``TreePrettyPrinter``
    constructor, the remaining keyword arguments go to its ``text()``
    method, and ``stream`` is used as the ``file`` argument of ``print``.
    For explanation of the arguments, see the documentation for
    `nltk.tree.prettyprinter.TreePrettyPrinter`.
    """
    from nltk.tree.prettyprinter import TreePrettyPrinter

    printer = TreePrettyPrinter(self, sentence, highlight)
    rendered = printer.text(**kwargs)
    print(rendered, file=stream)
|
||||
|
||||
def __repr__(self):
|
||||
childstr = ", ".join(repr(c) for c in self)
|
||||
return "{}({}, [{}])".format(
|
||||
type(self).__name__,
|
||||
repr(self._label),
|
||||
childstr,
|
||||
)
|
||||
|
||||
def _repr_svg_(self):
    """
    Return the SVG representation produced by ``svgling.draw_tree`` for
    this tree.
    """
    # Imported at call time, as in the original.
    from svgling import draw_tree

    diagram = draw_tree(self)
    return diagram._repr_svg_()
|
||||
|
||||
def __str__(self):
|
||||
return self.pformat()
|
||||
|
||||
def pprint(self, **kwargs):
    """
    Print the ``pformat()`` representation of this Tree.

    The optional ``stream`` keyword is removed from ``kwargs`` and used as
    the ``file`` argument of ``print`` (``None`` when absent); every other
    keyword argument is forwarded to ``pformat()``.
    """
    stream = kwargs.pop("stream", None)
    print(self.pformat(**kwargs), file=stream)
|
||||
|
||||
def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False):
    """
    :return: A pretty-printed string representation of this tree.
    :rtype: str
    :param margin: The right margin at which to do line-wrapping.  The
        one-line form is used when its length plus ``indent`` is
        strictly less than ``margin``.
    :type margin: int
    :param indent: The indentation level at which printing
        begins.  This number is used to decide how far to indent
        subsequent lines (children are placed at ``indent + 2``).
    :type indent: int
    :param nodesep: A string that is used to separate the node
        from the children.  The default is the empty string; a value
        of ``':'`` gives trees like
        ``(S: (NP: I) (VP: (V: saw) (NP: it)))``.
    :param parens: A two-item sequence holding the opening and the
        closing bracket strings.
    :param quotes: When false, string leaves are written as-is; when
        true they are written with ``repr``.
    """
    # Try writing it on one line.
    flat = self._pformat_flat(nodesep, parens, quotes)
    if len(flat) + indent < margin:
        return flat

    # If it doesn't fit on one line, put every child on its own line.
    label = self._label if isinstance(self._label, str) else repr(self._label)
    pieces = [f"{parens[0]}{label}{nodesep}"]
    pad = " " * (indent + 2)
    for child in self:
        if isinstance(child, Tree):
            rendered = child.pformat(margin, indent + 2, nodesep, parens, quotes)
        elif isinstance(child, tuple):
            # Tuple leaves are written as their items joined with "/".
            rendered = "/".join(child)
        elif isinstance(child, str) and not quotes:
            rendered = "%s" % child
        else:
            rendered = repr(child)
        pieces.append("\n" + pad + rendered)
    pieces.append(parens[1])
    return "".join(pieces)
|
||||
|
||||
def pformat_latex_qtree(self):
    r"""
    Returns a representation of the tree compatible with the
    LaTeX qtree package. This consists of the string ``\Tree``
    followed by the tree represented in bracketed notation, using
    ``[.`` and `` ]`` as brackets and with each of the characters
    ``# $ % & ~ _ { }`` prefixed by a backslash.

    For example, the following result was generated from a parse tree of
    the sentence ``The announcement astounded us``::

      \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
          [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]

    See https://www.ling.upenn.edu/advice/latex.html for the LaTeX
    style file for the qtree package.

    :return: A latex qtree representation of this tree.
    :rtype: str
    """
    bracketed = self.pformat(indent=6, nodesep="", parens=("[.", " ]"))
    reserved_chars = re.compile(r"([#\$%&~_\{\}])")
    # Prefix every reserved character with a single backslash.
    escaped = reserved_chars.sub(r"\\\1", bracketed)
    return r"\Tree " + escaped
|
||||
|
||||
def _pformat_flat(self, nodesep, parens, quotes):
|
||||
childstrs = []
|
||||
for child in self:
|
||||
if isinstance(child, Tree):
|
||||
childstrs.append(child._pformat_flat(nodesep, parens, quotes))
|
||||
elif isinstance(child, tuple):
|
||||
childstrs.append("/".join(child))
|
||||
elif isinstance(child, str) and not quotes:
|
||||
childstrs.append("%s" % child)
|
||||
else:
|
||||
childstrs.append(repr(child))
|
||||
if isinstance(self._label, str):
|
||||
return "{}{}{} {}{}".format(
|
||||
parens[0],
|
||||
self._label,
|
||||
nodesep,
|
||||
" ".join(childstrs),
|
||||
parens[1],
|
||||
)
|
||||
else:
|
||||
return "{}{}{} {}{}".format(
|
||||
parens[0],
|
||||
repr(self._label),
|
||||
nodesep,
|
||||
" ".join(childstrs),
|
||||
parens[1],
|
||||
)
|
||||
|
||||
|
||||
def _child_names(tree):
|
||||
names = []
|
||||
for child in tree:
|
||||
if isinstance(child, Tree):
|
||||
names.append(Nonterminal(child._label))
|
||||
else:
|
||||
names.append(child)
|
||||
return names
|
||||
|
||||
|
||||
######################################################################
|
||||
## Demonstration
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """
    A demonstration showing how Trees can be used.  It builds a Tree
    from a bracketed string, prints several of its properties, modifies
    and transforms it, builds a ProbabilisticTree, and prints the
    results of calling several of their methods.
    """
    from nltk import ProbabilisticTree, Tree

    # Demonstrate tree parsing.
    bracketed = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))"
    tree = Tree.fromstring(bracketed)
    print("Convert bracketed string into tree:")
    print(tree)
    print(repr(tree))

    print("Display tree properties:")
    print(tree.label())  # tree's constituent type
    print(tree[0])  # tree's first child
    print(tree[1])  # tree's second child
    print(tree.height())
    print(tree.leaves())
    print(tree[1])
    print(tree[1, 1])
    print(tree[1, 1, 0])

    # Demonstrate tree modification.
    noun_phrase = tree[0]
    noun_phrase.insert(1, Tree.fromstring("(JJ big)"))
    print("Tree modification:")
    print(tree)
    tree[1, 1, 1] = Tree.fromstring("(NN cake)")
    print(tree)
    print()

    # Tree transforms
    print("Collapse unary:")
    tree.collapse_unary()
    print(tree)
    print("Chomsky normal form:")
    tree.chomsky_normal_form()
    print(tree)
    print()

    # Demonstrate probabilistic trees.
    prob_tree = ProbabilisticTree("x", ["y", "z"], prob=0.5)
    print("Probabilistic Tree:")
    print(prob_tree)
    print()

    # Demonstrate parsing of treebank output format.
    tree = Tree.fromstring(tree.pformat())
    print("Convert tree to bracketed string and back again:")
    print(tree)
    print()

    # Demonstrate LaTeX output
    print("LaTeX output:")
    print(tree.pformat_latex_qtree())
    print()

    # Demonstrate Productions
    print("Production output:")
    print(tree.productions())
    print()

    # Demonstrate tree nodes containing objects other than strings
    tree.set_label(("test", 3))
    print(tree)
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``.
__all__ = ["Tree"]
|
||||
Reference in New Issue
Block a user