Initial commit
102
backend/venv/Lib/site-packages/nltk/parse/__init__.py
Normal file
@@ -0,0 +1,102 @@
# Natural Language Toolkit: Parsers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
NLTK Parsers

Classes and interfaces for producing tree structures that represent
the internal organization of a text. This task is known as "parsing"
the text, and the resulting tree structures are called the text's
"parses". Typically, the text is a single sentence, and the tree
structure represents the syntactic structure of the sentence.
However, parsers can also be used in other domains. For example,
parsers can be used to derive the morphological structure of the
morphemes that make up a word, or to derive the discourse structure
for a set of utterances.

Sometimes, a single piece of text can be represented by more than one
tree structure. Texts represented by more than one tree structure are
called "ambiguous" texts. Note that there are actually two ways in
which a text can be ambiguous:

    - The text has multiple correct parses.
    - There is not enough information to decide which of several
      candidate parses is correct.

However, the parser module does *not* distinguish these two types of
ambiguity.

The parser module defines ``ParserI``, a standard interface for parsing
texts; and two simple implementations of that interface,
``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains
sub-modules for specialized kinds of parsing:

    - ``nltk.parse.chart`` defines chart parsing, which uses dynamic
      programming to efficiently parse texts.
    - ``nltk.parse.pchart`` defines probabilistic parsing, which
      associates a probability with each parse.
"""

from nltk.parse.api import ParserI
from nltk.parse.bllip import BllipParser
from nltk.parse.chart import (
    BottomUpChartParser,
    BottomUpLeftCornerChartParser,
    ChartParser,
    LeftCornerChartParser,
    SteppingChartParser,
    TopDownChartParser,
)
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.earleychart import (
    EarleyChartParser,
    FeatureEarleyChartParser,
    FeatureIncrementalBottomUpChartParser,
    FeatureIncrementalBottomUpLeftCornerChartParser,
    FeatureIncrementalChartParser,
    FeatureIncrementalTopDownChartParser,
    IncrementalBottomUpChartParser,
    IncrementalBottomUpLeftCornerChartParser,
    IncrementalChartParser,
    IncrementalLeftCornerChartParser,
    IncrementalTopDownChartParser,
)
from nltk.parse.evaluate import DependencyEvaluator
from nltk.parse.featurechart import (
    FeatureBottomUpChartParser,
    FeatureBottomUpLeftCornerChartParser,
    FeatureChartParser,
    FeatureTopDownChartParser,
)
from nltk.parse.malt import MaltParser
from nltk.parse.nonprojectivedependencyparser import (
    NaiveBayesDependencyScorer,
    NonprojectiveDependencyParser,
    ProbabilisticNonprojectiveParser,
)
from nltk.parse.pchart import (
    BottomUpProbabilisticChartParser,
    InsideChartParser,
    LongestChartParser,
    RandomChartParser,
    UnsortedChartParser,
)
from nltk.parse.projectivedependencyparser import (
    ProbabilisticProjectiveDependencyParser,
    ProjectiveDependencyParser,
)
from nltk.parse.recursivedescent import (
    RecursiveDescentParser,
    SteppingRecursiveDescentParser,
)
from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser
from nltk.parse.transitionparser import TransitionParser
from nltk.parse.util import TestGrammar, extract_test_sentences, load_parser
from nltk.parse.viterbi import ViterbiParser
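

def _example_usage():
    """A minimal sketch, not part of upstream NLTK: parse a sentence with one
    of the ``ParserI`` implementations re-exported above, assuming only a toy
    CFG. The grammar and sentence are invented for illustration."""
    from nltk.grammar import CFG

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'dogs' | 'cats'
        VP -> V NP
        V -> 'chase'
        """
    )
    parser = ChartParser(grammar)
    # ParserI.parse() returns an iterator of Tree objects.
    for tree in parser.parse("dogs chase cats".split()):
        print(tree)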
72
backend/venv/Lib/site-packages/nltk/parse/api.py
Normal file
@@ -0,0 +1,72 @@
# Natural Language Toolkit: Parser API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

import itertools

from nltk.internals import overridden


class ParserI:
    """
    A processing class for deriving trees that represent possible
    structures for a sequence of tokens. These tree structures are
    known as "parses". Typically, parsers are used to derive syntax
    trees for sentences. But parsers can also be used to derive other
    kinds of tree structure, such as morphological trees and discourse
    structures.

    Subclasses must define:
      - at least one of: ``parse()``, ``parse_sents()``.

    Subclasses may define:
      - ``grammar()``
    """

    def grammar(self):
        """
        :return: The grammar used by this parser.
        """
        raise NotImplementedError()

    def parse(self, sent, *args, **kwargs):
        """
        :return: An iterator that generates parse trees for the sentence.
            When possible, the trees are sorted from most likely to least likely.

        :param sent: The sentence to be parsed
        :type sent: list(str)
        :rtype: iter(Tree)
        """
        if overridden(self.parse_sents):
            return next(self.parse_sents([sent], *args, **kwargs))
        elif overridden(self.parse_one):
            return (
                tree
                for tree in [self.parse_one(sent, *args, **kwargs)]
                if tree is not None
            )
        elif overridden(self.parse_all):
            return iter(self.parse_all(sent, *args, **kwargs))
        else:
            raise NotImplementedError()

    def parse_sents(self, sents, *args, **kwargs):
        """
        Apply ``self.parse()`` to each element of ``sents``.
        :rtype: iter(iter(Tree))
        """
        return (self.parse(sent, *args, **kwargs) for sent in sents)

    def parse_all(self, sent, *args, **kwargs):
        """:rtype: list(Tree)"""
        return list(self.parse(sent, *args, **kwargs))

    def parse_one(self, sent, *args, **kwargs):
        """:rtype: Tree or None"""
        return next(self.parse(sent, *args, **kwargs), None)
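

class _EchoParser(ParserI):
    """A minimal sketch, not part of upstream NLTK: the smallest useful
    ``ParserI`` subclass. Only ``parse()`` is overridden; ``parse_sents()``,
    ``parse_all()`` and ``parse_one()`` then work through the default
    implementations above. The flat one-level "parse" it yields is invented
    purely for illustration."""

    def parse(self, sent, *args, **kwargs):
        from nltk.tree import Tree

        # Yield a single flat tree with every token as a leaf under S.
        yield Tree("S", list(sent))


# Illustrative usage:
#   p = _EchoParser()
#   print(p.parse_one("the dog barks".split()))   # (S the dog barks)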
299
backend/venv/Lib/site-packages/nltk/parse/bllip.py
Normal file
@@ -0,0 +1,299 @@
|
||||
# Natural Language Toolkit: Interface to BLLIP Parser
|
||||
#
|
||||
# Author: David McClosky <dmcc@bigasterisk.com>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import Tree
|
||||
|
||||
"""
|
||||
Interface for parsing with BLLIP Parser. Requires the Python
|
||||
bllipparser module. BllipParser objects can be constructed with the
|
||||
``BllipParser.from_unified_model_dir`` class method or manually using the
|
||||
``BllipParser`` constructor. The former is generally easier if you have
|
||||
a BLLIP Parser unified model directory -- a basic model can be obtained
|
||||
from NLTK's downloader. More unified parsing models can be obtained with
|
||||
BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
|
||||
or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).
|
||||
|
||||
Basic usage::
|
||||
|
||||
# download and install a basic unified parsing model (Wall Street Journal)
|
||||
# sudo python -m nltk.downloader bllip_wsj_no_aux
|
||||
|
||||
>>> from nltk.data import find
|
||||
>>> model_dir = find('models/bllip_wsj_no_aux').path
|
||||
>>> bllip = BllipParser.from_unified_model_dir(model_dir)
|
||||
|
||||
# 1-best parsing
|
||||
>>> sentence1 = 'British left waffles on Falklands .'.split()
|
||||
>>> top_parse = bllip.parse_one(sentence1)
|
||||
>>> print(top_parse)
|
||||
(S1
|
||||
(S
|
||||
(NP (JJ British) (NN left))
|
||||
(VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
|
||||
(. .)))
|
||||
|
||||
# n-best parsing
|
||||
>>> sentence2 = 'Time flies'.split()
|
||||
>>> all_parses = bllip.parse_all(sentence2)
|
||||
>>> print(len(all_parses))
|
||||
50
|
||||
>>> print(all_parses[0])
|
||||
(S1 (S (NP (NNP Time)) (VP (VBZ flies))))
|
||||
|
||||
# incorporating external tagging constraints (None means unconstrained tag)
|
||||
>>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
|
||||
>>> print(next(constrained1))
|
||||
(S1 (NP (VB Time) (NNS flies)))
|
||||
>>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
|
||||
>>> print(next(constrained2))
|
||||
(S1 (NP (NN Time) (VBZ flies)))
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
|
||||
the 1st North American chapter of the Association for Computational
|
||||
Linguistics conference. Association for Computational Linguistics,
|
||||
2000.
|
||||
|
||||
- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
|
||||
and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
|
||||
Meeting on Association for Computational Linguistics. Association
|
||||
for Computational Linguistics, 2005.
|
||||
|
||||
Known issues
|
||||
------------
|
||||
|
||||
Note that BLLIP Parser is not currently threadsafe. Since this module
|
||||
uses a SWIG interface, it is potentially unsafe to create multiple
|
||||
``BllipParser`` objects in the same process. BLLIP Parser currently
|
||||
has issues with non-ASCII text and will raise an error if given any.
|
||||
|
||||
See https://pypi.python.org/pypi/bllipparser/ for more information
|
||||
on BLLIP Parser's Python interface.
|
||||
"""
|
||||
|
||||
__all__ = ["BllipParser"]
|
||||
|
||||
# this block allows this module to be imported even if bllipparser isn't
|
||||
# available
|
||||
try:
|
||||
from bllipparser import RerankingParser
|
||||
from bllipparser.RerankingParser import get_unified_model_parameters
|
||||
|
||||
def _ensure_bllip_import_or_error():
|
||||
pass
|
||||
|
||||
except ImportError as ie:
|
||||
|
||||
def _ensure_bllip_import_or_error(ie=ie):
|
||||
raise ImportError("Couldn't import bllipparser module: %s" % ie)
|
||||
|
||||
|
||||
def _ensure_ascii(words):
|
||||
try:
|
||||
for i, word in enumerate(words):
|
||||
word.encode("ascii")
|
||||
except UnicodeEncodeError as e:
|
||||
raise ValueError(
|
||||
f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser "
|
||||
"currently doesn't support non-ASCII inputs."
|
||||
) from e
|
||||
|
||||
|
||||
def _scored_parse_to_nltk_tree(scored_parse):
|
||||
return Tree.fromstring(str(scored_parse.ptb_parse))
|
||||
|
||||
|
||||
class BllipParser(ParserI):
|
||||
"""
|
||||
Interface for parsing with BLLIP Parser. BllipParser objects can be
|
||||
constructed with the ``BllipParser.from_unified_model_dir`` class
|
||||
method or manually using the ``BllipParser`` constructor.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
parser_model=None,
|
||||
reranker_features=None,
|
||||
reranker_weights=None,
|
||||
parser_options=None,
|
||||
reranker_options=None,
|
||||
):
|
||||
"""
|
||||
Load a BLLIP Parser model from scratch. You'll typically want to
|
||||
use the ``from_unified_model_dir()`` class method to construct
|
||||
this object.
|
||||
|
||||
:param parser_model: Path to parser model directory
|
||||
:type parser_model: str
|
||||
|
||||
:param reranker_features: Path to the reranker model's features file
|
||||
:type reranker_features: str
|
||||
|
||||
:param reranker_weights: Path to the reranker model's weights file
|
||||
:type reranker_weights: str
|
||||
|
||||
:param parser_options: optional dictionary of parser options, see
|
||||
``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
|
||||
for more information.
|
||||
:type parser_options: dict(str)
|
||||
|
||||
:param reranker_options: optional
|
||||
dictionary of reranker options, see
|
||||
``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
|
||||
for more information.
|
||||
:type reranker_options: dict(str)
|
||||
"""
|
||||
_ensure_bllip_import_or_error()
|
||||
|
||||
parser_options = parser_options or {}
|
||||
reranker_options = reranker_options or {}
|
||||
|
||||
self.rrp = RerankingParser()
|
||||
self.rrp.load_parser_model(parser_model, **parser_options)
|
||||
if reranker_features and reranker_weights:
|
||||
self.rrp.load_reranker_model(
|
||||
features_filename=reranker_features,
|
||||
weights_filename=reranker_weights,
|
||||
**reranker_options,
|
||||
)
|
||||
|
||||
def parse(self, sentence):
|
||||
"""
|
||||
Use BLLIP Parser to parse a sentence. Takes a sentence as a list
|
||||
of words; it will be automatically tagged with this BLLIP Parser
|
||||
instance's tagger.
|
||||
|
||||
:return: An iterator that generates parse trees for the sentence
|
||||
from most likely to least likely.
|
||||
|
||||
:param sentence: The sentence to be parsed
|
||||
:type sentence: list(str)
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
_ensure_ascii(sentence)
|
||||
nbest_list = self.rrp.parse(sentence)
|
||||
for scored_parse in nbest_list:
|
||||
yield _scored_parse_to_nltk_tree(scored_parse)
|
||||
|
||||
def tagged_parse(self, word_and_tag_pairs):
|
||||
"""
|
||||
Use BLLIP to parse a sentence. Takes a sentence as a list of
|
||||
(word, tag) tuples; the sentence must have already been tokenized
|
||||
and tagged. BLLIP will attempt to use the tags provided but may
|
||||
use others if it can't come up with a complete parse subject
|
||||
to those constraints. You may also specify a tag as ``None``
|
||||
to leave a token's tag unconstrained.
|
||||
|
||||
:return: An iterator that generates parse trees for the sentence
|
||||
from most likely to least likely.
|
||||
|
||||
:param word_and_tag_pairs: Input sentence to parse as (word, tag) pairs
:type word_and_tag_pairs: list(tuple(str, str))
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
words = []
|
||||
tag_map = {}
|
||||
for i, (word, tag) in enumerate(word_and_tag_pairs):
|
||||
words.append(word)
|
||||
if tag is not None:
|
||||
tag_map[i] = tag
|
||||
|
||||
_ensure_ascii(words)
|
||||
nbest_list = self.rrp.parse_tagged(words, tag_map)
|
||||
for scored_parse in nbest_list:
|
||||
yield _scored_parse_to_nltk_tree(scored_parse)
|
||||
|
||||
@classmethod
|
||||
def from_unified_model_dir(
|
||||
cls, model_dir, parser_options=None, reranker_options=None
|
||||
):
|
||||
"""
|
||||
Create a ``BllipParser`` object from a unified parsing model
|
||||
directory. Unified parsing model directories are a standardized
|
||||
way of storing BLLIP parser and reranker models together on disk.
|
||||
See ``bllipparser.RerankingParser.get_unified_model_parameters()``
|
||||
for more information about unified model directories.
|
||||
|
||||
:return: A ``BllipParser`` object using the parser and reranker
|
||||
models in the model directory.
|
||||
|
||||
:param model_dir: Path to the unified model directory.
|
||||
:type model_dir: str
|
||||
:param parser_options: optional dictionary of parser options, see
|
||||
``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
|
||||
for more information.
|
||||
:type parser_options: dict(str)
|
||||
:param reranker_options: optional dictionary of reranker options, see
|
||||
``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
|
||||
for more information.
|
||||
:type reranker_options: dict(str)
|
||||
:rtype: BllipParser
|
||||
"""
|
||||
(
|
||||
parser_model_dir,
|
||||
reranker_features_filename,
|
||||
reranker_weights_filename,
|
||||
) = get_unified_model_parameters(model_dir)
|
||||
return cls(
|
||||
parser_model_dir,
|
||||
reranker_features_filename,
|
||||
reranker_weights_filename,
|
||||
parser_options,
|
||||
reranker_options,
|
||||
)
|
||||
|
||||
|
||||
def demo():
|
||||
"""This assumes the Python module bllipparser is installed."""
|
||||
|
||||
# download and install a basic unified parsing model (Wall Street Journal)
|
||||
# sudo python -m nltk.downloader bllip_wsj_no_aux
|
||||
|
||||
from nltk.data import find
|
||||
|
||||
model_dir = find("models/bllip_wsj_no_aux").path
|
||||
|
||||
print("Loading BLLIP Parsing models...")
|
||||
# the easiest way to get started is to use a unified model
|
||||
bllip = BllipParser.from_unified_model_dir(model_dir)
|
||||
print("Done.")
|
||||
|
||||
sentence1 = "British left waffles on Falklands .".split()
|
||||
sentence2 = "I saw the man with the telescope .".split()
|
||||
# this sentence is known to fail under the WSJ parsing model
|
||||
fail1 = "# ! ? : -".split()
|
||||
for sentence in (sentence1, sentence2, fail1):
|
||||
print("Sentence: %r" % " ".join(sentence))
|
||||
try:
|
||||
tree = next(bllip.parse(sentence))
|
||||
print(tree)
|
||||
except StopIteration:
|
||||
print("(parse failed)")
|
||||
|
||||
# n-best parsing demo
|
||||
for i, parse in enumerate(bllip.parse(sentence1)):
|
||||
print("parse %d:\n%s" % (i, parse))
|
||||
|
||||
# using external POS tag constraints
|
||||
print(
|
||||
"forcing 'tree' to be 'NN':",
|
||||
next(bllip.tagged_parse([("A", None), ("tree", "NN")])),
|
||||
)
|
||||
print(
|
||||
"forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
|
||||
next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])),
|
||||
)
|
||||
# constraints don't have to make sense... (though on more complicated
|
||||
# sentences, they may cause the parse to fail)
|
||||
print(
|
||||
"forcing 'A' to be 'NNP':",
|
||||
next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])),
|
||||
)
|
||||
1848
backend/venv/Lib/site-packages/nltk/parse/chart.py
Normal file
File diff suppressed because it is too large
805
backend/venv/Lib/site-packages/nltk/parse/corenlp.py
Normal file
@@ -0,0 +1,805 @@
|
||||
# Natural Language Toolkit: Interface to the CoreNLP REST API.
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
|
||||
from nltk.internals import _java_options, config_java, find_jar_iter, java
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
from nltk.tag.api import TaggerI
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.tree import Tree
|
||||
|
||||
_stanford_url = "https://stanfordnlp.github.io/CoreNLP/"
|
||||
|
||||
|
||||
class CoreNLPServerError(EnvironmentError):
|
||||
"""Exceptions associated with the Core NLP server."""
|
||||
|
||||
|
||||
def try_port(port=0):
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
sock.bind(("", port))
|
||||
|
||||
p = sock.getsockname()[1]
|
||||
sock.close()
|
||||
|
||||
return p
|
||||
|
||||
|
||||
class CoreNLPServer:
|
||||
_MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
|
||||
_JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_to_jar=None,
|
||||
path_to_models_jar=None,
|
||||
verbose=False,
|
||||
java_options=None,
|
||||
corenlp_options=None,
|
||||
port=None,
|
||||
):
|
||||
if corenlp_options is None:
|
||||
corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
|
||||
|
||||
jars = list(
|
||||
find_jar_iter(
|
||||
self._JAR,
|
||||
path_to_jar,
|
||||
env_vars=("CORENLP",),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
)
|
||||
)
|
||||
|
||||
# find the most recent code and model jar
|
||||
stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name))
|
||||
|
||||
if port is None:
|
||||
try:
|
||||
port = try_port(9000)
|
||||
except OSError:
|
||||
port = try_port()
|
||||
corenlp_options.extend(["-port", str(port)])
|
||||
else:
|
||||
try_port(port)
|
||||
corenlp_options.extend(["-port", str(port)])
|
||||
|
||||
self.url = f"http://localhost:{port}"
|
||||
|
||||
model_jar = max(
|
||||
find_jar_iter(
|
||||
self._MODEL_JAR_PATTERN,
|
||||
path_to_models_jar,
|
||||
env_vars=("CORENLP_MODELS",),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
),
|
||||
key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
|
||||
)
|
||||
|
||||
self.verbose = verbose
|
||||
|
||||
self._classpath = stanford_jar, model_jar
|
||||
|
||||
self.corenlp_options = corenlp_options
|
||||
self.java_options = java_options or ["-mx2g"]
|
||||
|
||||
def start(self, stdout="devnull", stderr="devnull"):
|
||||
"""Starts the CoreNLP server
|
||||
|
||||
:param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
|
||||
"""
|
||||
import requests
|
||||
|
||||
cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
|
||||
|
||||
if self.corenlp_options:
|
||||
cmd.extend(self.corenlp_options)
|
||||
|
||||
# Configure java.
|
||||
default_options = " ".join(_java_options)
|
||||
config_java(options=self.java_options, verbose=self.verbose)
|
||||
|
||||
try:
|
||||
self.popen = java(
|
||||
cmd,
|
||||
classpath=self._classpath,
|
||||
blocking=False,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
)
|
||||
finally:
|
||||
# Return java configurations to their default values.
|
||||
config_java(options=default_options, verbose=self.verbose)
|
||||
|
||||
# Check that the server is still running.
|
||||
returncode = self.popen.poll()
|
||||
if returncode is not None:
|
||||
_, stderrdata = self.popen.communicate()
|
||||
raise CoreNLPServerError(
|
||||
returncode,
|
||||
"Could not start the server. "
|
||||
"The error was: {}".format(stderrdata.decode("ascii")),
|
||||
)
|
||||
|
||||
for i in range(30):
|
||||
try:
|
||||
response = requests.get(requests.compat.urljoin(self.url, "live"))
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
else:
|
||||
if response.ok:
|
||||
break
|
||||
else:
|
||||
raise CoreNLPServerError("Could not connect to the server.")
|
||||
|
||||
for i in range(60):
|
||||
try:
|
||||
response = requests.get(requests.compat.urljoin(self.url, "ready"))
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
else:
|
||||
if response.ok:
|
||||
break
|
||||
else:
|
||||
raise CoreNLPServerError("The server is not ready.")
|
||||
|
||||
def stop(self):
|
||||
self.popen.terminate()
|
||||
self.popen.wait()
|
||||
|
||||
def __enter__(self):
|
||||
self.start()
|
||||
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.stop()
|
||||
return False
|
||||
|
||||
|
||||
class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
|
||||
"""Interface to the CoreNLP Parser."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url="http://localhost:9000",
|
||||
encoding="utf8",
|
||||
tagtype=None,
|
||||
strict_json=True,
|
||||
):
|
||||
import requests
|
||||
|
||||
self.url = url
|
||||
self.encoding = encoding
|
||||
|
||||
if tagtype not in ["pos", "ner", None]:
|
||||
raise ValueError("tagtype must be either 'pos', 'ner' or None")
|
||||
|
||||
self.tagtype = tagtype
|
||||
self.strict_json = strict_json
|
||||
|
||||
self.session = requests.Session()
|
||||
|
||||
def parse_sents(self, sentences, *args, **kwargs):
|
||||
"""Parse multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list where each sentence is a list of
|
||||
words. Each sentence will be automatically tagged with this
|
||||
CoreNLPParser instance's tagger.
|
||||
|
||||
If whitespace exists inside a token, the token will be treated as
|
||||
several tokens.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(str))
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
# Converting list(list(str)) -> list(str)
|
||||
sentences = (" ".join(words) for words in sentences)
|
||||
return self.raw_parse_sents(sentences, *args, **kwargs)
|
||||
|
||||
def raw_parse(self, sentence, properties=None, *args, **kwargs):
|
||||
"""Parse a sentence.
|
||||
|
||||
Takes a sentence as a string; before parsing, it will be automatically
|
||||
tokenized and tagged by the CoreNLP Parser.
|
||||
|
||||
:param sentence: Input sentence to parse
|
||||
:type sentence: str
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
default_properties = {"tokenize.whitespace": "false"}
|
||||
default_properties.update(properties or {})
|
||||
|
||||
return next(
|
||||
self.raw_parse_sents(
|
||||
[sentence], properties=default_properties, *args, **kwargs
|
||||
)
|
||||
)
|
||||
|
||||
def api_call(self, data, properties=None, timeout=60):
|
||||
default_properties = {
|
||||
"outputFormat": "json",
|
||||
"annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
|
||||
parser_annotator=self.parser_annotator
|
||||
),
|
||||
}
|
||||
|
||||
default_properties.update(properties or {})
|
||||
|
||||
response = self.session.post(
|
||||
self.url,
|
||||
params={"properties": json.dumps(default_properties)},
|
||||
data=data.encode(self.encoding),
|
||||
headers={"Content-Type": f"text/plain; charset={self.encoding}"},
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
return response.json(strict=self.strict_json)
|
||||
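# Illustrative sketch, not part of upstream NLTK: ``api_call`` can also be
# used directly when extra annotators or a longer timeout are needed. The
# URL and sentence below are invented for the example; a CoreNLP server must
# already be running there.
#
#   parser = CoreNLPParser(url="http://localhost:9000")
#   result = parser.api_call(
#       "The quick brown fox jumps over the lazy dog.",
#       properties={"annotators": "tokenize,ssplit,pos,lemma,ner"},
#       timeout=120,
#   )
#   print(result["sentences"][0]["tokens"][0]["word"])   # 'The'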
|
||||
def raw_parse_sents(
|
||||
self, sentences, verbose=False, properties=None, *args, **kwargs
|
||||
):
|
||||
"""Parse multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list of strings. Each sentence will be
|
||||
automatically tokenized and tagged.
|
||||
|
||||
:param sentences: Input sentences to parse.
|
||||
:type sentences: list(str)
|
||||
:rtype: iter(iter(Tree))
|
||||
|
||||
"""
|
||||
default_properties = {
|
||||
# Only splits on '\n', never inside the sentence.
|
||||
"ssplit.eolonly": "true"
|
||||
}
|
||||
|
||||
default_properties.update(properties or {})
|
||||
|
||||
"""
|
||||
for sentence in sentences:
|
||||
parsed_data = self.api_call(sentence, properties=default_properties)
|
||||
|
||||
assert len(parsed_data['sentences']) == 1
|
||||
|
||||
for parse in parsed_data['sentences']:
|
||||
tree = self.make_tree(parse)
|
||||
yield iter([tree])
|
||||
"""
|
||||
parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
|
||||
for parsed_sent in parsed_data["sentences"]:
|
||||
tree = self.make_tree(parsed_sent)
|
||||
yield iter([tree])
|
||||
|
||||
def parse_text(self, text, *args, **kwargs):
|
||||
"""Parse a piece of text.
|
||||
|
||||
The text might contain several sentences which will be split by CoreNLP.
|
||||
|
||||
:param str text: text to be split.
|
||||
:returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables?
|
||||
|
||||
"""
|
||||
parsed_data = self.api_call(text, *args, **kwargs)
|
||||
|
||||
for parse in parsed_data["sentences"]:
|
||||
yield self.make_tree(parse)
|
||||
|
||||
def tokenize(self, text, properties=None):
|
||||
"""Tokenize a string of text.
|
||||
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The CoreNLP server can be started using the following notation, although
|
||||
we recommend the `with CoreNLPServer() as server:` context manager notation
|
||||
to ensure that the server is always stopped.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> parser = CoreNLPParser(url=server.url)
|
||||
|
||||
>>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
|
||||
>>> list(parser.tokenize(text))
|
||||
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
|
||||
>>> s = "The colour of the wall is blue."
|
||||
>>> list(
|
||||
... parser.tokenize(
|
||||
... 'The colour of the wall is blue.',
|
||||
... properties={'tokenize.options': 'americanize=true'},
|
||||
... )
|
||||
... )
|
||||
['The', 'colour', 'of', 'the', 'wall', 'is', 'blue', '.']
|
||||
>>> server.stop()
|
||||
|
||||
"""
|
||||
default_properties = {"annotators": "tokenize,ssplit"}
|
||||
|
||||
default_properties.update(properties or {})
|
||||
|
||||
result = self.api_call(text, properties=default_properties)
|
||||
|
||||
for sentence in result["sentences"]:
|
||||
for token in sentence["tokens"]:
|
||||
yield token["originalText"] or token["word"]
|
||||
|
||||
def tag_sents(self, sentences, properties=None):
|
||||
"""
|
||||
Tag multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list where each sentence is a list of
|
||||
tokens.
|
||||
|
||||
:param sentences: Input sentences to tag
|
||||
:type sentences: list(list(str))
|
||||
:rtype: list(list(tuple(str, str)))
|
||||
"""
|
||||
|
||||
# Converting list(list(str)) -> list(str)
|
||||
sentences = (" ".join(words) for words in sentences)
|
||||
|
||||
if properties is None:
|
||||
properties = {"tokenize.whitespace": "true", "ner.useSUTime": "false"}
|
||||
|
||||
return [sentences[0] for sentences in self.raw_tag_sents(sentences, properties)]
|
||||
|
||||
def tag(self, sentence: str, properties=None) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
Tag a list of tokens.
|
||||
|
||||
:rtype: list(tuple(str, str))
|
||||
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The CoreNLP server can be started using the following notation, although
|
||||
we recommend the `with CoreNLPServer() as server:` context manager notation
|
||||
to ensure that the server is always stopped.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> parser = CoreNLPParser(url=server.url, tagtype='ner')
|
||||
>>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
|
||||
>>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
|
||||
('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]
|
||||
|
||||
>>> parser = CoreNLPParser(url=server.url, tagtype='pos')
|
||||
>>> tokens = "What is the airspeed of an unladen swallow ?".split()
|
||||
>>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
|
||||
('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
|
||||
('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
|
||||
>>> server.stop()
|
||||
"""
|
||||
return self.tag_sents([sentence], properties)[0]
|
||||
|
||||
def raw_tag_sents(self, sentences, properties=None):
|
||||
"""
|
||||
Tag multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list where each sentence is a string.
|
||||
|
||||
:param sentences: Input sentences to tag
|
||||
:type sentences: list(str)
|
||||
:rtype: list(list(list(tuple(str, str))))
|
||||
"""
|
||||
default_properties = {
|
||||
"ssplit.isOneSentence": "true",
|
||||
"annotators": "tokenize,ssplit,",
|
||||
}
|
||||
default_properties.update(properties or {})
|
||||
|
||||
# Supports only 'pos' or 'ner' tags.
|
||||
assert self.tagtype in [
|
||||
"pos",
|
||||
"ner",
|
||||
], "CoreNLP tagger supports only 'pos' or 'ner' tags."
|
||||
default_properties["annotators"] += self.tagtype
|
||||
for sentence in sentences:
|
||||
tagged_data = self.api_call(sentence, properties=default_properties)
|
||||
yield [
|
||||
[
|
||||
(token["word"], token[self.tagtype])
|
||||
for token in tagged_sentence["tokens"]
|
||||
]
|
||||
for tagged_sentence in tagged_data["sentences"]
|
||||
]
|
||||
|
||||
|
||||
class CoreNLPParser(GenericCoreNLPParser):
|
||||
"""
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The recommended usage of `CoreNLPParser` is using the context manager notation:
|
||||
>>> with CoreNLPServer() as server:
|
||||
... parser = CoreNLPParser(url=server.url)
|
||||
... next(
|
||||
... parser.raw_parse('The quick brown fox jumps over the lazy dog.')
|
||||
... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______________|__________________________
|
||||
| VP |
|
||||
| _________|___ |
|
||||
| | PP |
|
||||
| | ________|___ |
|
||||
NP | | NP |
|
||||
____|__________ | | _______|____ |
|
||||
DT JJ JJ NN VBZ IN DT JJ NN .
|
||||
| | | | | | | | | |
|
||||
The quick brown fox jumps over the lazy dog .
|
||||
|
||||
Alternatively, the server can be started using the following notation.
|
||||
Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
|
||||
outside of Python.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> parser = CoreNLPParser(url=server.url)
|
||||
|
||||
>>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
|
||||
... [
|
||||
... 'The quick brown fox jumps over the lazy dog.',
|
||||
... 'The quick grey wolf jumps over the lazy fox.',
|
||||
... ]
|
||||
... )
|
||||
|
||||
>>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______________|__________________________
|
||||
| VP |
|
||||
| _________|___ |
|
||||
| | PP |
|
||||
| | ________|___ |
|
||||
NP | | NP |
|
||||
____|__________ | | _______|____ |
|
||||
DT JJ JJ NN VBZ IN DT JJ NN .
|
||||
| | | | | | | | | |
|
||||
The quick brown fox jumps over the lazy dog .
|
||||
|
||||
>>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______________|__________________________
|
||||
| VP |
|
||||
| _________|___ |
|
||||
| | PP |
|
||||
| | ________|___ |
|
||||
NP | | NP |
|
||||
____|_________ | | _______|____ |
|
||||
DT JJ JJ NN VBZ IN DT JJ NN .
|
||||
| | | | | | | | | |
|
||||
The quick grey wolf jumps over the lazy fox .
|
||||
|
||||
>>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
|
||||
... [
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ]
|
||||
... )
|
||||
|
||||
>>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______|____
|
||||
| VP
|
||||
| ________|___
|
||||
NP | NP
|
||||
| | ___|___
|
||||
PRP VBP DT NN
|
||||
| | | |
|
||||
I 'm a dog
|
||||
|
||||
>>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
____|___________
|
||||
| VP
|
||||
| ___________|_____________
|
||||
| | NP
|
||||
| | _______|________________________
|
||||
| | NP | | |
|
||||
| | _____|_______ | | |
|
||||
NP | NP | | NP |
|
||||
| | ______|_________ | | ___|____ |
|
||||
DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB-
|
||||
| | | | | | | | | |
|
||||
This is my friends ' cat -LRB- the tabby -RRB-
|
||||
|
||||
>>> parse_john, parse_mary, = parser.parse_text(
|
||||
... 'John loves Mary. Mary walks.'
|
||||
... )
|
||||
|
||||
>>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_____|_____________
|
||||
| VP |
|
||||
| ____|___ |
|
||||
NP | NP |
|
||||
| | | |
|
||||
NNP VBZ NNP .
|
||||
| | | |
|
||||
John loves Mary .
|
||||
|
||||
>>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_____|____
|
||||
NP VP |
|
||||
| | |
|
||||
NNP VBZ .
|
||||
| | |
|
||||
Mary walks .
|
||||
|
||||
Special cases
|
||||
|
||||
>>> next(
|
||||
... parser.raw_parse(
|
||||
... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
|
||||
... 'Jessica Lynch have angrily dismissed claims made in her biography '
|
||||
... 'that she was raped by her Iraqi captors.'
|
||||
... )
|
||||
... ).height()
|
||||
14
|
||||
|
||||
>>> next(
|
||||
... parser.raw_parse(
|
||||
... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
|
||||
... '0.05 percent, at 997.02.'
|
||||
... )
|
||||
... ).height()
|
||||
11
|
||||
|
||||
>>> server.stop()
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "penn"
|
||||
parser_annotator = "parse"
|
||||
|
||||
def make_tree(self, result):
|
||||
return Tree.fromstring(result["parse"])
|
||||
|
||||
|
||||
class CoreNLPDependencyParser(GenericCoreNLPParser):
|
||||
"""Dependency parser.
|
||||
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The recommended usage of `CoreNLPParser` is using the context manager notation:
|
||||
>>> with CoreNLPServer() as server:
|
||||
... dep_parser = CoreNLPDependencyParser(url=server.url)
|
||||
... parse, = dep_parser.raw_parse(
|
||||
... 'The quick brown fox jumps over the lazy dog.'
|
||||
... )
|
||||
... print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 4 det
|
||||
quick JJ 4 amod
|
||||
brown JJ 4 amod
|
||||
fox NN 5 nsubj
|
||||
jumps VBZ 0 ROOT
|
||||
over IN 9 case
|
||||
the DT 9 det
|
||||
lazy JJ 9 amod
|
||||
dog NN 5 obl
|
||||
. . 5 punct
|
||||
|
||||
Alternatively, the server can be started using the following notation.
|
||||
Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
|
||||
outside of Python.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> dep_parser = CoreNLPDependencyParser(url=server.url)
|
||||
>>> parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
|
||||
>>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE
|
||||
(jumps (fox The quick brown) (dog over the lazy) .)
|
||||
|
||||
>>> for governor, dep, dependent in parse.triples():
|
||||
... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE
|
||||
('jumps', 'VBZ') nsubj ('fox', 'NN')
|
||||
('fox', 'NN') det ('The', 'DT')
|
||||
('fox', 'NN') amod ('quick', 'JJ')
|
||||
('fox', 'NN') amod ('brown', 'JJ')
|
||||
('jumps', 'VBZ') obl ('dog', 'NN')
|
||||
('dog', 'NN') case ('over', 'IN')
|
||||
('dog', 'NN') det ('the', 'DT')
|
||||
('dog', 'NN') amod ('lazy', 'JJ')
|
||||
('jumps', 'VBZ') punct ('.', '.')
|
||||
|
||||
>>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
|
||||
... [
|
||||
... 'The quick brown fox jumps over the lazy dog.',
|
||||
... 'The quick grey wolf jumps over the lazy fox.',
|
||||
... ]
|
||||
... )
|
||||
>>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 4 det
|
||||
quick JJ 4 amod
|
||||
brown JJ 4 amod
|
||||
fox NN 5 nsubj
|
||||
jumps VBZ 0 ROOT
|
||||
over IN 9 case
|
||||
the DT 9 det
|
||||
lazy JJ 9 amod
|
||||
dog NN 5 obl
|
||||
. . 5 punct
|
||||
|
||||
>>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 4 det
|
||||
quick JJ 4 amod
|
||||
grey JJ 4 amod
|
||||
wolf NN 5 nsubj
|
||||
jumps VBZ 0 ROOT
|
||||
over IN 9 case
|
||||
the DT 9 det
|
||||
lazy JJ 9 amod
|
||||
fox NN 5 obl
|
||||
. . 5 punct
|
||||
|
||||
>>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
|
||||
... [
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ]
|
||||
... )
|
||||
>>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
I PRP 4 nsubj
|
||||
'm VBP 4 cop
|
||||
a DT 4 det
|
||||
dog NN 0 ROOT
|
||||
|
||||
>>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
This DT 6 nsubj
|
||||
is VBZ 6 cop
|
||||
my PRP$ 4 nmod:poss
|
||||
friends NNS 6 nmod:poss
|
||||
' POS 4 case
|
||||
cat NN 0 ROOT
|
||||
( -LRB- 9 punct
|
||||
the DT 9 det
|
||||
tabby NN 6 dep
|
||||
) -RRB- 9 punct
|
||||
|
||||
>>> parse_john, parse_mary, = dep_parser.parse_text(
|
||||
... 'John loves Mary. Mary walks.'
|
||||
... )
|
||||
|
||||
>>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
John NNP 2 nsubj
|
||||
loves VBZ 0 ROOT
|
||||
Mary NNP 2 obj
|
||||
. . 2 punct
|
||||
|
||||
>>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
Mary NNP 2 nsubj
|
||||
walks VBZ 0 ROOT
|
||||
. . 2 punct
|
||||
|
||||
Special cases
|
||||
|
||||
Non-breaking space inside of a token.
|
||||
|
||||
>>> len(
|
||||
... next(
|
||||
... dep_parser.raw_parse(
|
||||
... 'Anhalt said children typically treat a 20-ounce soda bottle as one '
|
||||
... 'serving, while it actually contains 2 1/2 servings.'
|
||||
... )
|
||||
... ).nodes
|
||||
... )
|
||||
23
|
||||
|
||||
Phone numbers.
|
||||
|
||||
>>> len(
|
||||
... next(
|
||||
... dep_parser.raw_parse('This is not going to crash: 01 111 555.')
|
||||
... ).nodes
|
||||
... )
|
||||
10
|
||||
|
||||
>>> print(
|
||||
... next(
|
||||
... dep_parser.raw_parse('The underscore _ should not simply disappear.')
|
||||
... ).to_conll(4)
|
||||
... ) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 2 det
|
||||
underscore NN 7 nsubj
|
||||
_ NFP 7 punct
|
||||
should MD 7 aux
|
||||
not RB 7 advmod
|
||||
simply RB 7 advmod
|
||||
disappear VB 0 ROOT
|
||||
. . 7 punct
|
||||
|
||||
>>> print(
|
||||
... next(
|
||||
... dep_parser.raw_parse(
|
||||
... 'for all of its insights into the dream world of teen life , and its electronic expression through '
|
||||
... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
|
||||
... '1/2-hour running time .'
|
||||
... )
|
||||
... ).to_conll(4)
|
||||
... ) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
|
||||
for IN 2 case
|
||||
all DT 24 obl
|
||||
of IN 5 case
|
||||
its PRP$ 5 nmod:poss
|
||||
insights NNS 2 nmod
|
||||
into IN 9 case
|
||||
the DT 9 det
|
||||
dream NN 9 compound
|
||||
world NN 5 nmod
|
||||
of IN 12 case
|
||||
teen NN 12 compound
|
||||
...
|
||||
|
||||
>>> server.stop()
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "conll2007"
|
||||
parser_annotator = "depparse"
|
||||
|
||||
def make_tree(self, result):
|
||||
return DependencyGraph(
|
||||
(
|
||||
" ".join(n_items[1:]) # NLTK expects an iterable of strings...
|
||||
for n_items in sorted(transform(result))
|
||||
),
|
||||
cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token.
|
||||
)
|
||||
|
||||
|
||||
def transform(sentence):
|
||||
for dependency in sentence["basicDependencies"]:
|
||||
dependent_index = dependency["dependent"]
|
||||
token = sentence["tokens"][dependent_index - 1]
|
||||
|
||||
# Return values that we don't know as '_'. Also, consider tag and ctag
|
||||
# to be equal.
|
||||
yield (
|
||||
dependent_index,
|
||||
"_",
|
||||
token["word"],
|
||||
token["lemma"],
|
||||
token["pos"],
|
||||
token["pos"],
|
||||
"_",
|
||||
str(dependency["governor"]),
|
||||
dependency["dep"],
|
||||
"_",
|
||||
"_",
|
||||
)
|
||||
799
backend/venv/Lib/site-packages/nltk/parse/dependencygraph.py
Normal file
@@ -0,0 +1,799 @@
|
||||
# Natural Language Toolkit: Dependency Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Jason Narad <jason.narad@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
Tools for reading and writing dependency trees.
|
||||
The input is assumed to be in Malt-TAB format
|
||||
(https://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from pprint import pformat
|
||||
|
||||
from nltk.internals import find_binary
|
||||
from nltk.tree import Tree
|
||||
|
||||
#################################################################
|
||||
# DependencyGraph Class
|
||||
#################################################################
|
||||
|
||||
|
||||
class DependencyGraph:
|
||||
"""
|
||||
A container for the nodes and labelled edges of a dependency structure.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tree_str=None,
|
||||
cell_extractor=None,
|
||||
zero_based=False,
|
||||
cell_separator=None,
|
||||
top_relation_label="ROOT",
|
||||
):
|
||||
"""Dependency graph.
|
||||
|
||||
We place a dummy `TOP` node with the index 0, since the root node is
|
||||
often assigned 0 as its head. This also means that the indexing of the
|
||||
nodes corresponds directly to the Malt-TAB format, which starts at 1.
|
||||
|
||||
If zero_based is True, the input is assumed to be Malt-TAB-like with
node numbers starting at 0 and the root node assigned -1 (as produced
by, e.g., zpar).
|
||||
|
||||
:param str cell_separator: the cell separator. If not provided, cells
|
||||
are split by whitespace.
|
||||
|
||||
:param str top_relation_label: the label by which the top relation is
|
||||
identified, for example, `ROOT`, `null` or `TOP`.
|
||||
"""
|
||||
self.nodes = defaultdict(
|
||||
lambda: {
|
||||
"address": None,
|
||||
"word": None,
|
||||
"lemma": None,
|
||||
"ctag": None,
|
||||
"tag": None,
|
||||
"feats": None,
|
||||
"head": None,
|
||||
"deps": defaultdict(list),
|
||||
"rel": None,
|
||||
}
|
||||
)
|
||||
|
||||
self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0})
|
||||
|
||||
self.root = None
|
||||
|
||||
if tree_str:
|
||||
self._parse(
|
||||
tree_str,
|
||||
cell_extractor=cell_extractor,
|
||||
zero_based=zero_based,
|
||||
cell_separator=cell_separator,
|
||||
top_relation_label=top_relation_label,
|
||||
)
|
||||
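# Illustrative sketch, not part of upstream NLTK: building a graph from a
# small Malt-TAB fragment, as described in the docstring above. Each row is
# word, tag and head index; 'loves' has head 0, so it becomes the root.
#
#   dg = DependencyGraph(
#       "John N 2\n"
#       "loves V 0\n"
#       "Mary N 2"
#   )
#   print(dg.root["word"])   # 'loves'
#   print(dg.tree())         # (loves John Mary)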
|
||||
def remove_by_address(self, address):
|
||||
"""
|
||||
Removes the node with the given address. References
|
||||
to this node in others will still exist.
|
||||
"""
|
||||
del self.nodes[address]
|
||||
|
||||
def redirect_arcs(self, originals, redirect):
|
||||
"""
|
||||
Redirects arcs to any of the nodes in the originals list
|
||||
to the redirect node address.
|
||||
"""
|
||||
for node in self.nodes.values():
|
||||
new_deps = []
|
||||
for dep in node["deps"]:
|
||||
if dep in originals:
|
||||
new_deps.append(redirect)
|
||||
else:
|
||||
new_deps.append(dep)
|
||||
node["deps"] = new_deps
|
||||
|
||||
def add_arc(self, head_address, mod_address):
|
||||
"""
|
||||
Adds an arc from the node specified by head_address to the
|
||||
node specified by the mod address.
|
||||
"""
|
||||
relation = self.nodes[mod_address]["rel"]
|
||||
self.nodes[head_address]["deps"].setdefault(relation, [])
|
||||
self.nodes[head_address]["deps"][relation].append(mod_address)
|
||||
# self.nodes[head_address]['deps'].append(mod_address)
|
||||
|
||||
def connect_graph(self):
|
||||
"""
|
||||
Fully connects all non-root nodes. All nodes are set to be dependents
|
||||
of the root node.
|
||||
"""
|
||||
for node1 in self.nodes.values():
|
||||
for node2 in self.nodes.values():
|
||||
if node1["address"] != node2["address"] and node2["rel"] != "TOP":
|
||||
relation = node2["rel"]
|
||||
node1["deps"].setdefault(relation, [])
|
||||
node1["deps"][relation].append(node2["address"])
|
||||
# node1['deps'].append(node2['address'])
|
||||
|
||||
def get_by_address(self, node_address):
|
||||
"""Return the node with the given address."""
|
||||
return self.nodes[node_address]
|
||||
|
||||
def contains_address(self, node_address):
|
||||
"""
|
||||
Returns true if the graph contains a node with the given node
|
||||
address, false otherwise.
|
||||
"""
|
||||
return node_address in self.nodes
|
||||
|
||||
def to_dot(self):
|
||||
"""Return a dot representation suitable for using with Graphviz.
|
||||
|
||||
>>> dg = DependencyGraph(
|
||||
... 'John N 2\\n'
|
||||
... 'loves V 0\\n'
|
||||
... 'Mary N 2'
|
||||
... )
|
||||
>>> print(dg.to_dot())
|
||||
digraph G{
|
||||
edge [dir=forward]
|
||||
node [shape=plaintext]
|
||||
<BLANKLINE>
|
||||
0 [label="0 (None)"]
|
||||
0 -> 2 [label="ROOT"]
|
||||
1 [label="1 (John)"]
|
||||
2 [label="2 (loves)"]
|
||||
2 -> 1 [label=""]
|
||||
2 -> 3 [label=""]
|
||||
3 [label="3 (Mary)"]
|
||||
}
|
||||
|
||||
"""
|
||||
# Start the digraph specification
|
||||
s = "digraph G{\n"
|
||||
s += "edge [dir=forward]\n"
|
||||
s += "node [shape=plaintext]\n"
|
||||
|
||||
# Draw the remaining nodes
|
||||
for node in sorted(self.nodes.values(), key=lambda v: v["address"]):
|
||||
s += '\n{} [label="{} ({})"]'.format(
|
||||
node["address"],
|
||||
node["address"],
|
||||
node["word"],
|
||||
)
|
||||
for rel, deps in node["deps"].items():
|
||||
for dep in deps:
|
||||
if rel is not None:
|
||||
s += '\n{} -> {} [label="{}"]'.format(node["address"], dep, rel)
|
||||
else:
|
||||
s += "\n{} -> {} ".format(node["address"], dep)
|
||||
s += "\n}"
|
||||
|
||||
return s
|
||||
|
||||
def _repr_svg_(self):
|
||||
"""Show SVG representation of the transducer (IPython magic).
|
||||
>>> from nltk.test.setup_fixt import check_binary
|
||||
>>> check_binary('dot')
|
||||
>>> dg = DependencyGraph(
|
||||
... 'John N 2\\n'
|
||||
... 'loves V 0\\n'
|
||||
... 'Mary N 2'
|
||||
... )
|
||||
>>> dg._repr_svg_().split('\\n')[0]
|
||||
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
|
||||
|
||||
"""
|
||||
dot_string = self.to_dot()
|
||||
return dot2img(dot_string)
|
||||
|
||||
def __str__(self):
|
||||
return pformat(self.nodes)
|
||||
|
||||
def __repr__(self):
|
||||
return f"<DependencyGraph with {len(self.nodes)} nodes>"
|
||||
|
||||
@staticmethod
|
||||
def load(
|
||||
filename, zero_based=False, cell_separator=None, top_relation_label="ROOT"
|
||||
):
|
||||
"""
|
||||
:param filename: a name of a file in Malt-TAB format
|
||||
:param zero_based: nodes in the input file are numbered starting from 0
|
||||
rather than 1 (as produced by, e.g., zpar)
|
||||
:param str cell_separator: the cell separator. If not provided, cells
|
||||
are split by whitespace.
|
||||
:param str top_relation_label: the label by which the top relation is
|
||||
identified, for example, `ROOT`, `null` or `TOP`.
|
||||
|
||||
:return: a list of DependencyGraphs
|
||||
|
||||
"""
|
||||
with open(filename) as infile:
|
||||
return [
|
||||
DependencyGraph(
|
||||
tree_str,
|
||||
zero_based=zero_based,
|
||||
cell_separator=cell_separator,
|
||||
top_relation_label=top_relation_label,
|
||||
)
|
||||
for tree_str in infile.read().split("\n\n")
|
||||
]
|
||||
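# Illustrative sketch, not part of upstream NLTK: reading a whole file of
# Malt-TAB sentences with ``load()``; each blank-line-separated block becomes
# one DependencyGraph. The file name is invented for the example.
#
#   graphs = DependencyGraph.load("sentences.maltab")
#   for dg in graphs:
#       print(dg.tree())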
|
||||
def left_children(self, node_index):
|
||||
"""
|
||||
Returns the number of left children under the node specified
|
||||
by the given address.
|
||||
"""
|
||||
children = chain.from_iterable(self.nodes[node_index]["deps"].values())
|
||||
index = self.nodes[node_index]["address"]
|
||||
return sum(1 for c in children if c < index)
|
||||
|
||||
def right_children(self, node_index):
|
||||
"""
|
||||
Returns the number of right children under the node specified
|
||||
by the given address.
|
||||
"""
|
||||
children = chain.from_iterable(self.nodes[node_index]["deps"].values())
|
||||
index = self.nodes[node_index]["address"]
|
||||
return sum(1 for c in children if c > index)
|
||||
|
||||
def add_node(self, node):
|
||||
if not self.contains_address(node["address"]):
|
||||
self.nodes[node["address"]].update(node)
|
||||
|
||||
def _parse(
|
||||
self,
|
||||
input_,
|
||||
cell_extractor=None,
|
||||
zero_based=False,
|
||||
cell_separator=None,
|
||||
top_relation_label="ROOT",
|
||||
):
|
||||
"""Parse a sentence.
|
||||
|
||||
:param extractor: a function that given a tuple of cells returns a
|
||||
7-tuple, where the values are ``word, lemma, ctag, tag, feats, head,
|
||||
rel``.
|
||||
|
||||
:param str cell_separator: the cell separator. If not provided, cells
|
||||
are split by whitespace.
|
||||
|
||||
:param str top_relation_label: the label by which the top relation is
|
||||
identified, for example, `ROOT`, `null` or `TOP`.
|
||||
|
||||
"""
|
||||
|
||||
def extract_3_cells(cells, index):
|
||||
word, tag, head = cells
|
||||
return index, word, word, tag, tag, "", head, ""
|
||||
|
||||
def extract_4_cells(cells, index):
|
||||
word, tag, head, rel = cells
|
||||
return index, word, word, tag, tag, "", head, rel
|
||||
|
||||
def extract_7_cells(cells, index):
|
||||
line_index, word, lemma, tag, _, head, rel = cells
|
||||
try:
|
||||
index = int(line_index)
|
||||
except ValueError:
|
||||
# index can't be parsed as an integer, use default
|
||||
pass
|
||||
return index, word, lemma, tag, tag, "", head, rel
|
||||
|
||||
def extract_10_cells(cells, index):
|
||||
line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
|
||||
try:
|
||||
index = int(line_index)
|
||||
except ValueError:
|
||||
# index can't be parsed as an integer, use default
|
||||
pass
|
||||
return index, word, lemma, ctag, tag, feats, head, rel
|
||||
|
||||
extractors = {
|
||||
3: extract_3_cells,
|
||||
4: extract_4_cells,
|
||||
7: extract_7_cells,
|
||||
10: extract_10_cells,
|
||||
}
|
||||
|
||||
if isinstance(input_, str):
|
||||
input_ = (line for line in input_.split("\n"))
|
||||
|
||||
lines = (l.rstrip() for l in input_)
|
||||
lines = (l for l in lines if l)
|
||||
|
||||
cell_number = None
|
||||
for index, line in enumerate(lines, start=1):
|
||||
cells = line.split(cell_separator)
|
||||
if cell_number is None:
|
||||
cell_number = len(cells)
|
||||
else:
|
||||
assert cell_number == len(cells)
|
||||
|
||||
if cell_extractor is None:
|
||||
try:
|
||||
cell_extractor = extractors[cell_number]
|
||||
except KeyError as e:
|
||||
raise ValueError(
|
||||
"Number of tab-delimited fields ({}) not supported by "
|
||||
"CoNLL(10) or Malt-Tab(4) format".format(cell_number)
|
||||
) from e
|
||||
|
||||
try:
|
||||
index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(
|
||||
cells, index
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
# cell_extractor doesn't take 2 arguments or doesn't return 8
|
||||
# values; assume the cell_extractor is an older external
|
||||
# extractor and doesn't accept or return an index.
|
||||
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
|
||||
|
||||
if head == "_":
|
||||
continue
|
||||
|
||||
head = int(head)
|
||||
if zero_based:
|
||||
head += 1
|
||||
|
||||
self.nodes[index].update(
|
||||
{
|
||||
"address": index,
|
||||
"word": word,
|
||||
"lemma": lemma,
|
||||
"ctag": ctag,
|
||||
"tag": tag,
|
||||
"feats": feats,
|
||||
"head": head,
|
||||
"rel": rel,
|
||||
}
|
||||
)
|
||||
|
||||
# Make sure that the fake root node has labeled dependencies.
|
||||
if (cell_number == 3) and (head == 0):
|
||||
rel = top_relation_label
|
||||
self.nodes[head]["deps"][rel].append(index)
|
||||
|
||||
if self.nodes[0]["deps"][top_relation_label]:
|
||||
root_address = self.nodes[0]["deps"][top_relation_label][0]
|
||||
self.root = self.nodes[root_address]
|
||||
self.top_relation_label = top_relation_label
|
||||
else:
|
||||
warnings.warn(
|
||||
"The graph doesn't contain a node " "that depends on the root element."
|
||||
)
|
||||
|
||||
def _word(self, node, filter=True):
|
||||
w = node["word"]
|
||||
if filter:
|
||||
if w != ",":
|
||||
return w
|
||||
return w
|
||||
|
||||
def _tree(self, i):
|
||||
"""Turn dependency graphs into NLTK trees.
|
||||
|
||||
:param int i: index of a node
|
||||
:return: either a word (if the indexed node is a leaf) or a ``Tree``.
|
||||
"""
|
||||
node = self.get_by_address(i)
|
||||
word = node["word"]
|
||||
deps = sorted(chain.from_iterable(node["deps"].values()))
|
||||
|
||||
if deps:
|
||||
return Tree(word, [self._tree(dep) for dep in deps])
|
||||
else:
|
||||
return word
|
||||
|
||||
def tree(self):
|
||||
"""
|
||||
Starting with the ``root`` node, build a dependency tree using the NLTK
|
||||
``Tree`` constructor. Dependency labels are omitted.
|
||||
"""
|
||||
node = self.root
|
||||
|
||||
word = node["word"]
|
||||
deps = sorted(chain.from_iterable(node["deps"].values()))
|
||||
return Tree(word, [self._tree(dep) for dep in deps])
|
||||
|
||||
def triples(self, node=None):
|
||||
"""
|
||||
Extract dependency triples of the form:
|
||||
((head word, head tag), rel, (dep word, dep tag))
|
||||
"""
|
||||
|
||||
if not node:
|
||||
node = self.root
|
||||
|
||||
head = (node["word"], node["ctag"])
|
||||
for i in sorted(chain.from_iterable(node["deps"].values())):
|
||||
dep = self.get_by_address(i)
|
||||
yield (head, dep["rel"], (dep["word"], dep["ctag"]))
|
||||
yield from self.triples(node=dep)
|
||||
|
||||
def _hd(self, i):
|
||||
try:
|
||||
return self.nodes[i]["head"]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
def _rel(self, i):
|
||||
try:
|
||||
return self.nodes[i]["rel"]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
# what's the return type? Boolean or list?
|
||||
def contains_cycle(self):
|
||||
"""Check whether there are cycles.
|
||||
|
||||
>>> dg = DependencyGraph(treebank_data)
|
||||
>>> dg.contains_cycle()
|
||||
False
|
||||
|
||||
>>> cyclic_dg = DependencyGraph()
|
||||
>>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
|
||||
>>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
|
||||
>>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
|
||||
>>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
|
||||
>>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
|
||||
>>> cyclic_dg.nodes = {
|
||||
... 0: top,
|
||||
... 1: child1,
|
||||
... 2: child2,
|
||||
... 3: child3,
|
||||
... 4: child4,
|
||||
... }
|
||||
>>> cyclic_dg.root = top
|
||||
|
||||
>>> cyclic_dg.contains_cycle()
|
||||
[1, 2, 4, 3]
|
||||
|
||||
"""
|
||||
distances = {}
|
||||
|
||||
for node in self.nodes.values():
|
||||
for dep in node["deps"]:
|
||||
key = tuple([node["address"], dep])
|
||||
distances[key] = 1
|
||||
|
||||
for _ in self.nodes:
|
||||
new_entries = {}
|
||||
|
||||
for pair1 in distances:
|
||||
for pair2 in distances:
|
||||
if pair1[1] == pair2[0]:
|
||||
key = tuple([pair1[0], pair2[1]])
|
||||
new_entries[key] = distances[pair1] + distances[pair2]
|
||||
|
||||
for pair in new_entries:
|
||||
distances[pair] = new_entries[pair]
|
||||
if pair[0] == pair[1]:
|
||||
path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0])
|
||||
return path
|
||||
|
||||
return False # return []?
|
||||
|
||||
def get_cycle_path(self, curr_node, goal_node_index):
|
||||
for dep in curr_node["deps"]:
|
||||
if dep == goal_node_index:
|
||||
return [curr_node["address"]]
|
||||
for dep in curr_node["deps"]:
|
||||
path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
|
||||
if len(path) > 0:
|
||||
path.insert(0, curr_node["address"])
|
||||
return path
|
||||
return []
|
||||
|
||||
def to_conll(self, style):
|
||||
"""
|
||||
The dependency graph in CoNLL format.
|
||||
|
||||
:param style: the style to use for the format (3, 4, 10 columns)
|
||||
:type style: int
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
if style == 3:
|
||||
template = "{word}\t{tag}\t{head}\n"
|
||||
elif style == 4:
|
||||
template = "{word}\t{tag}\t{head}\t{rel}\n"
|
||||
elif style == 10:
|
||||
template = (
|
||||
"{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Number of tab-delimited fields ({}) not supported by "
|
||||
"CoNLL(10) or Malt-Tab(4) format".format(style)
|
||||
)
|
||||
|
||||
return "".join(
|
||||
template.format(i=i, **node)
|
||||
for i, node in sorted(self.nodes.items())
|
||||
if node["tag"] != "TOP"
|
||||
)
|
||||
|
||||
def nx_graph(self):
|
||||
"""Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
|
||||
import networkx
|
||||
|
||||
nx_nodelist = list(range(1, len(self.nodes)))
|
||||
nx_edgelist = [
|
||||
(n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n)
|
||||
]
|
||||
self.nx_labels = {}
|
||||
for n in nx_nodelist:
|
||||
self.nx_labels[n] = self.nodes[n]["word"]
|
||||
|
||||
g = networkx.MultiDiGraph()
|
||||
g.add_nodes_from(nx_nodelist)
|
||||
g.add_edges_from(nx_edgelist)
|
||||
|
||||
return g
|
||||
|
||||
|
||||
def dot2img(dot_string, t="svg"):
|
||||
"""
|
||||
Create image representation from dot_string, using the 'dot' program
|
||||
from the Graphviz package.
|
||||
|
||||
Use the 't' argument to specify the image file format, e.g. 'jpeg', 'eps',
|
||||
'json', 'png' or 'webp' (Running 'dot -T:' lists all available formats).
|
||||
|
||||
Note that the "capture_output" option of subprocess.run() is only available
|
||||
with text formats (like svg), but not with binary image formats (like png).
|
||||
"""
|
||||
|
||||
try:
|
||||
find_binary("dot")
|
||||
try:
|
||||
if t in ["dot", "dot_json", "json", "svg"]:
|
||||
proc = subprocess.run(
|
||||
["dot", "-T%s" % t],
|
||||
capture_output=True,
|
||||
input=dot_string,
|
||||
text=True,
|
||||
)
|
||||
else:
|
||||
proc = subprocess.run(
|
||||
["dot", "-T%s" % t],
|
||||
input=bytes(dot_string, encoding="utf8"),
|
||||
)
|
||||
return proc.stdout
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
"Cannot create image representation by running dot from string: {}"
|
||||
"".format(dot_string)
|
||||
) from e
|
||||
except OSError as e:
|
||||
raise Exception("Cannot find the dot binary from Graphviz package") from e
|
||||
|
||||
|
||||
class DependencyGraphError(Exception):
|
||||
"""Dependency graph exception."""
|
||||
|
||||
|
||||
def demo():
|
||||
malt_demo()
|
||||
conll_demo()
|
||||
conll_file_demo()
|
||||
cycle_finding_demo()
|
||||
|
||||
|
||||
def malt_demo(nx=False):
|
||||
"""
|
||||
A demonstration of the result of reading a dependency
|
||||
version of the first sentence of the Penn Treebank.
|
||||
"""
|
||||
dg = DependencyGraph(
|
||||
"""Pierre NNP 2 NMOD
|
||||
Vinken NNP 8 SUB
|
||||
, , 2 P
|
||||
61 CD 5 NMOD
|
||||
years NNS 6 AMOD
|
||||
old JJ 2 NMOD
|
||||
, , 2 P
|
||||
will MD 0 ROOT
|
||||
join VB 8 VC
|
||||
the DT 11 NMOD
|
||||
board NN 9 OBJ
|
||||
as IN 9 VMOD
|
||||
a DT 15 NMOD
|
||||
nonexecutive JJ 15 NMOD
|
||||
director NN 12 PMOD
|
||||
Nov. NNP 9 VMOD
|
||||
29 CD 16 NMOD
|
||||
. . 9 VMOD
|
||||
"""
|
||||
)
|
||||
tree = dg.tree()
|
||||
tree.pprint()
|
||||
if nx:
|
||||
# currently doesn't work
|
||||
import networkx
|
||||
from matplotlib import pylab
|
||||
|
||||
g = dg.nx_graph()
|
||||
g.info()
|
||||
pos = networkx.spring_layout(g, dim=1)
|
||||
networkx.draw_networkx_nodes(g, pos, node_size=50)
|
||||
# networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
|
||||
networkx.draw_networkx_labels(g, pos, dg.nx_labels)
|
||||
pylab.xticks([])
|
||||
pylab.yticks([])
|
||||
pylab.savefig("tree.png")
|
||||
pylab.show()
|
||||
|
||||
|
||||
def conll_demo():
|
||||
"""
|
||||
A demonstration of how to read a string representation of
|
||||
a CoNLL format dependency tree.
|
||||
"""
|
||||
dg = DependencyGraph(conll_data1)
|
||||
tree = dg.tree()
|
||||
tree.pprint()
|
||||
print(dg)
|
||||
print(dg.to_conll(4))
|
||||
|
||||
|
||||
def conll_file_demo():
|
||||
print("Mass conll_read demo...")
|
||||
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
||||
for graph in graphs:
|
||||
tree = graph.tree()
|
||||
print("\n")
|
||||
tree.pprint()
|
||||
|
||||
|
||||
def cycle_finding_demo():
|
||||
dg = DependencyGraph(treebank_data)
|
||||
print(dg.contains_cycle())
|
||||
cyclic_dg = DependencyGraph()
|
||||
cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0})
|
||||
cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1})
|
||||
cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2})
|
||||
cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3})
|
||||
cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4})
|
||||
print(cyclic_dg.contains_cycle())
|
||||
|
||||
|
||||
treebank_data = """Pierre NNP 2 NMOD
|
||||
Vinken NNP 8 SUB
|
||||
, , 2 P
|
||||
61 CD 5 NMOD
|
||||
years NNS 6 AMOD
|
||||
old JJ 2 NMOD
|
||||
, , 2 P
|
||||
will MD 0 ROOT
|
||||
join VB 8 VC
|
||||
the DT 11 NMOD
|
||||
board NN 9 OBJ
|
||||
as IN 9 VMOD
|
||||
a DT 15 NMOD
|
||||
nonexecutive JJ 15 NMOD
|
||||
director NN 12 PMOD
|
||||
Nov. NNP 9 VMOD
|
||||
29 CD 16 NMOD
|
||||
. . 9 VMOD
|
||||
"""
|
||||
|
||||
conll_data1 = """
|
||||
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 met met Prep Prep voor 8 mod _ _
|
||||
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
|
||||
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
|
||||
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
|
||||
7 gaan ga V V hulp|inf 6 vc _ _
|
||||
8 winkelen winkel V V intrans|inf 11 cnj _ _
|
||||
9 , , Punc Punc komma 8 punct _ _
|
||||
10 zwemmen zwem V V intrans|inf 11 cnj _ _
|
||||
11 of of Conj Conj neven 7 vc _ _
|
||||
12 terrassen terras N N soort|mv|neut 11 cnj _ _
|
||||
13 . . Punc Punc punt 12 punct _ _
|
||||
"""
|
||||
|
||||
conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _
|
||||
2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _
|
||||
4 wild wild Adj Adj attr|stell|onverv 5 mod _ _
|
||||
5 zwaaien zwaai N N soort|mv|neut 2 vc _ _
|
||||
6 . . Punc Punc punt 5 punct _ _
|
||||
|
||||
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 met met Prep Prep voor 8 mod _ _
|
||||
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
|
||||
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
|
||||
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
|
||||
7 gaan ga V V hulp|inf 6 vc _ _
|
||||
8 winkelen winkel V V intrans|inf 11 cnj _ _
|
||||
9 , , Punc Punc komma 8 punct _ _
|
||||
10 zwemmen zwem V V intrans|inf 11 cnj _ _
|
||||
11 of of Conj Conj neven 7 vc _ _
|
||||
12 terrassen terras N N soort|mv|neut 11 cnj _ _
|
||||
13 . . Punc Punc punt 12 punct _ _
|
||||
|
||||
1 Dat dat Pron Pron aanw|neut|attr 2 det _ _
|
||||
2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _
|
||||
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
|
||||
4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _
|
||||
5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _
|
||||
6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _
|
||||
7 . . Punc Punc punt 6 punct _ _
|
||||
|
||||
1 Het het Pron Pron onbep|neut|zelfst 2 su _ _
|
||||
2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 bij bij Prep Prep voor 2 ld _ _
|
||||
4 de de Art Art bep|zijdofmv|neut 6 det _ _
|
||||
5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _
|
||||
6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _
|
||||
7 die die Pron Pron betr|neut|zelfst 6 mod _ _
|
||||
8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _
|
||||
9 ginds ginds Adv Adv gew|aanw 12 mod _ _
|
||||
10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _
|
||||
11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _
|
||||
12 gelaten laat V V trans|verldw|onverv 11 vc _ _
|
||||
13 . . Punc Punc punt 12 punct _ _
|
||||
|
||||
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _
|
||||
3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _
|
||||
4 naast naast Prep Prep voor 11 mod _ _
|
||||
5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _
|
||||
6 op op Prep Prep voor 11 ld _ _
|
||||
7 de de Art Art bep|zijdofmv|neut 8 det _ _
|
||||
8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _
|
||||
9 kunnen kan V V hulp|inf 2 vc _ _
|
||||
10 gaan ga V V hulp|inf 9 vc _ _
|
||||
11 liggen lig V V intrans|inf 10 vc _ _
|
||||
12 . . Punc Punc punt 11 punct _ _
|
||||
|
||||
1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _
|
||||
3 mams mams N N soort|ev|neut 4 det _ _
|
||||
4 rug rug N N soort|ev|neut 5 obj1 _ _
|
||||
5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _
|
||||
6 hebben heb V V hulp|inf 2 vc _ _
|
||||
7 en en Conj Conj neven 0 ROOT _ _
|
||||
8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _
|
||||
9 de de Art Art bep|zijdofmv|neut 10 det _ _
|
||||
10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _
|
||||
11 . . Punc Punc punt 10 punct _ _
|
||||
|
||||
1 Of of Conj Conj onder|metfin 0 ROOT _ _
|
||||
2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _
|
||||
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
|
||||
4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _
|
||||
5 met met Prep Prep voor 10 mod _ _
|
||||
6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _
|
||||
7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _
|
||||
8 rond rond Adv Adv deelv 10 svp _ _
|
||||
9 kunnen kan V V hulp|inf 3 vc _ _
|
||||
10 slenteren slenter V V intrans|inf 9 vc _ _
|
||||
11 in in Prep Prep voor 10 mod _ _
|
||||
12 de de Art Art bep|zijdofmv|neut 13 det _ _
|
||||
13 buurt buurt N N soort|ev|neut 11 obj1 _ _
|
||||
14 van van Prep Prep voor 13 mod _ _
|
||||
15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _
|
||||
16 . . Punc Punc punt 15 punct _ _
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
552
backend/venv/Lib/site-packages/nltk/parse/earleychart.py
Normal file
@@ -0,0 +1,552 @@
|
||||
# Natural Language Toolkit: An Incremental Earley Chart Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||||
# Rob Speer <rspeer@mit.edu>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Jean Mark Gawron <gawron@mail.sdsu.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Data classes and parser implementations for *incremental* chart
|
||||
parsers, which use dynamic programming to efficiently parse a text.
|
||||
A "chart parser" derives parse trees for a text by iteratively adding
|
||||
\"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree
|
||||
structure for a subsequence of the text. The "chart" is a
|
||||
\"blackboard\" for composing and combining these hypotheses.
|
||||
|
||||
A parser is "incremental", if it guarantees that for all i, j where i < j,
|
||||
all edges ending at i are built before any edges ending at j.
|
||||
This is appealing for, say, speech recognizer hypothesis filtering.
|
||||
|
||||
The main parser class is ``EarleyChartParser``, which is a top-down
|
||||
algorithm, originally formulated by Jay Earley (1970).
|
||||
"""
|
||||
|
||||
from time import perf_counter
|
||||
|
||||
from nltk.parse.chart import (
|
||||
BottomUpPredictCombineRule,
|
||||
BottomUpPredictRule,
|
||||
CachedTopDownPredictRule,
|
||||
Chart,
|
||||
ChartParser,
|
||||
EdgeI,
|
||||
EmptyPredictRule,
|
||||
FilteredBottomUpPredictCombineRule,
|
||||
FilteredSingleEdgeFundamentalRule,
|
||||
LeafEdge,
|
||||
LeafInitRule,
|
||||
SingleEdgeFundamentalRule,
|
||||
TopDownInitRule,
|
||||
)
|
||||
from nltk.parse.featurechart import (
|
||||
FeatureBottomUpPredictCombineRule,
|
||||
FeatureBottomUpPredictRule,
|
||||
FeatureChart,
|
||||
FeatureChartParser,
|
||||
FeatureEmptyPredictRule,
|
||||
FeatureSingleEdgeFundamentalRule,
|
||||
FeatureTopDownInitRule,
|
||||
FeatureTopDownPredictRule,
|
||||
)
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental Chart
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class IncrementalChart(Chart):
|
||||
def initialize(self):
|
||||
# A sequence of edge lists contained in this chart.
|
||||
self._edgelists = tuple([] for x in self._positions())
|
||||
|
||||
# The set of child pointer lists associated with each edge.
|
||||
self._edge_to_cpls = {}
|
||||
|
||||
# Indexes mapping attribute values to lists of edges
|
||||
# (used by select()).
|
||||
self._indexes = {}
|
||||
|
||||
def edges(self):
|
||||
return list(self.iteredges())
|
||||
|
||||
def iteredges(self):
|
||||
return (edge for edgelist in self._edgelists for edge in edgelist)
|
||||
|
||||
def select(self, end, **restrictions):
|
||||
edgelist = self._edgelists[end]
|
||||
|
||||
# If there are no restrictions, then return all edges.
|
||||
if restrictions == {}:
|
||||
return iter(edgelist)
|
||||
|
||||
# Find the index corresponding to the given restrictions.
|
||||
restr_keys = sorted(restrictions.keys())
|
||||
restr_keys = tuple(restr_keys)
|
||||
|
||||
# If it doesn't exist, then create it.
|
||||
if restr_keys not in self._indexes:
|
||||
self._add_index(restr_keys)
|
||||
|
||||
vals = tuple(restrictions[key] for key in restr_keys)
|
||||
return iter(self._indexes[restr_keys][end].get(vals, []))
|
||||
|
||||
def _add_index(self, restr_keys):
|
||||
# Make sure it's a valid index.
|
||||
for key in restr_keys:
|
||||
if not hasattr(EdgeI, key):
|
||||
raise ValueError("Bad restriction: %s" % key)
|
||||
|
||||
# Create the index.
|
||||
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
|
||||
|
||||
# Add all existing edges to the index.
|
||||
for end, edgelist in enumerate(self._edgelists):
|
||||
this_index = index[end]
|
||||
for edge in edgelist:
|
||||
vals = tuple(getattr(edge, key)() for key in restr_keys)
|
||||
this_index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _register_with_indexes(self, edge):
|
||||
end = edge.end()
|
||||
for restr_keys, index in self._indexes.items():
|
||||
vals = tuple(getattr(edge, key)() for key in restr_keys)
|
||||
index[end].setdefault(vals, []).append(edge)
|
||||
|
||||
def _append_edge(self, edge):
|
||||
self._edgelists[edge.end()].append(edge)
|
||||
|
||||
def _positions(self):
|
||||
return range(self.num_leaves() + 1)
|
||||
|
||||
|
||||
class FeatureIncrementalChart(IncrementalChart, FeatureChart):
|
||||
def select(self, end, **restrictions):
|
||||
edgelist = self._edgelists[end]
|
||||
|
||||
# If there are no restrictions, then return all edges.
|
||||
if restrictions == {}:
|
||||
return iter(edgelist)
|
||||
|
||||
# Find the index corresponding to the given restrictions.
|
||||
restr_keys = sorted(restrictions.keys())
|
||||
restr_keys = tuple(restr_keys)
|
||||
|
||||
# If it doesn't exist, then create it.
|
||||
if restr_keys not in self._indexes:
|
||||
self._add_index(restr_keys)
|
||||
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(restrictions[key]) for key in restr_keys
|
||||
)
|
||||
return iter(self._indexes[restr_keys][end].get(vals, []))
|
||||
|
||||
def _add_index(self, restr_keys):
|
||||
# Make sure it's a valid index.
|
||||
for key in restr_keys:
|
||||
if not hasattr(EdgeI, key):
|
||||
raise ValueError("Bad restriction: %s" % key)
|
||||
|
||||
# Create the index.
|
||||
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
|
||||
|
||||
# Add all existing edges to the index.
|
||||
for end, edgelist in enumerate(self._edgelists):
|
||||
this_index = index[end]
|
||||
for edge in edgelist:
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)())
|
||||
for key in restr_keys
|
||||
)
|
||||
this_index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _register_with_indexes(self, edge):
|
||||
end = edge.end()
|
||||
for restr_keys, index in self._indexes.items():
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
|
||||
)
|
||||
index[end].setdefault(vals, []).append(edge)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental CFG Rules
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class CompleteFundamentalRule(SingleEdgeFundamentalRule):
|
||||
def _apply_incomplete(self, chart, grammar, left_edge):
|
||||
end = left_edge.end()
|
||||
# When the chart is incremental, we only have to look for
|
||||
# empty complete edges here.
|
||||
for right_edge in chart.select(
|
||||
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
|
||||
):
|
||||
new_edge = left_edge.move_dot_forward(right_edge.end())
|
||||
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class CompleterRule(CompleteFundamentalRule):
|
||||
_fundamental_rule = CompleteFundamentalRule()
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if not isinstance(edge, LeafEdge):
|
||||
yield from self._fundamental_rule.apply(chart, grammar, edge)
|
||||
|
||||
|
||||
class ScannerRule(CompleteFundamentalRule):
|
||||
_fundamental_rule = CompleteFundamentalRule()
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if isinstance(edge, LeafEdge):
|
||||
yield from self._fundamental_rule.apply(chart, grammar, edge)
|
||||
|
||||
|
||||
class PredictorRule(CachedTopDownPredictRule):
|
||||
pass
|
||||
|
||||
|
||||
class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
|
||||
def apply(self, chart, grammar, edge):
|
||||
# Since the Filtered rule only works for grammars without empty productions,
|
||||
# we only have to bother with complete edges here.
|
||||
if edge.is_complete():
|
||||
yield from self._apply_complete(chart, grammar, edge)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental FCFG Rules
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
|
||||
def _apply_incomplete(self, chart, grammar, left_edge):
|
||||
fr = self._fundamental_rule
|
||||
end = left_edge.end()
|
||||
# When the chart is incremental, we only have to look for
|
||||
# empty complete edges here.
|
||||
for right_edge in chart.select(
|
||||
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, left_edge, right_edge)
|
||||
|
||||
|
||||
class FeatureCompleterRule(CompleterRule):
|
||||
_fundamental_rule = FeatureCompleteFundamentalRule()
|
||||
|
||||
|
||||
class FeatureScannerRule(ScannerRule):
|
||||
_fundamental_rule = FeatureCompleteFundamentalRule()
|
||||
|
||||
|
||||
class FeaturePredictorRule(FeatureTopDownPredictRule):
|
||||
pass
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental CFG Chart Parsers
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
EARLEY_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
TopDownInitRule(),
|
||||
CompleterRule(),
|
||||
ScannerRule(),
|
||||
PredictorRule(),
|
||||
]
|
||||
TD_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
TopDownInitRule(),
|
||||
CachedTopDownPredictRule(),
|
||||
CompleteFundamentalRule(),
|
||||
]
|
||||
BU_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
EmptyPredictRule(),
|
||||
BottomUpPredictRule(),
|
||||
CompleteFundamentalRule(),
|
||||
]
|
||||
BU_LC_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
EmptyPredictRule(),
|
||||
BottomUpPredictCombineRule(),
|
||||
CompleteFundamentalRule(),
|
||||
]
|
||||
|
||||
LC_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FilteredBottomUpPredictCombineRule(),
|
||||
FilteredCompleteFundamentalRule(),
|
||||
]
|
||||
|
||||
|
||||
class IncrementalChartParser(ChartParser):
|
||||
"""
|
||||
An *incremental* chart parser implementing Jay Earley's
|
||||
parsing algorithm:
|
||||
|
||||
| For each index end in [0, 1, ..., N]:
|
||||
| For each edge such that edge.end = end:
|
||||
| If edge is incomplete and edge.next is not a part of speech:
|
||||
| Apply PredictorRule to edge
|
||||
| If edge is incomplete and edge.next is a part of speech:
|
||||
| Apply ScannerRule to edge
|
||||
| If edge is complete:
|
||||
| Apply CompleterRule to edge
|
||||
| Return any complete parses in the chart
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=BU_LC_INCREMENTAL_STRATEGY,
|
||||
trace=0,
|
||||
trace_chart_width=50,
|
||||
chart_class=IncrementalChart,
|
||||
):
|
||||
"""
|
||||
Create a new Earley chart parser, that uses ``grammar`` to
|
||||
parse texts.
|
||||
|
||||
:type grammar: CFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
:type trace_chart_width: int
|
||||
:param trace_chart_width: The default total width reserved for
|
||||
the chart in trace output. The remainder of each line will
|
||||
be used to display edges.
|
||||
:param chart_class: The class that should be used to create
|
||||
the charts used by this parser.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
self._trace_chart_width = trace_chart_width
|
||||
self._chart_class = chart_class
|
||||
|
||||
self._axioms = []
|
||||
self._inference_rules = []
|
||||
for rule in strategy:
|
||||
if rule.NUM_EDGES == 0:
|
||||
self._axioms.append(rule)
|
||||
elif rule.NUM_EDGES == 1:
|
||||
self._inference_rules.append(rule)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Incremental inference rules must have " "NUM_EDGES == 0 or 1"
|
||||
)
|
||||
|
||||
def chart_parse(self, tokens, trace=None):
|
||||
if trace is None:
|
||||
trace = self._trace
|
||||
trace_new_edges = self._trace_new_edges
|
||||
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
chart = self._chart_class(tokens)
|
||||
grammar = self._grammar
|
||||
|
||||
# Width, for printing trace edges.
|
||||
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
|
||||
if trace:
|
||||
print(chart.pretty_format_leaves(trace_edge_width))
|
||||
|
||||
for axiom in self._axioms:
|
||||
new_edges = list(axiom.apply(chart, grammar))
|
||||
trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
|
||||
|
||||
inference_rules = self._inference_rules
|
||||
for end in range(chart.num_leaves() + 1):
|
||||
if trace > 1:
|
||||
print("\n* Processing queue:", end, "\n")
|
||||
agenda = list(chart.select(end=end))
|
||||
while agenda:
|
||||
edge = agenda.pop()
|
||||
for rule in inference_rules:
|
||||
new_edges = list(rule.apply(chart, grammar, edge))
|
||||
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
|
||||
for new_edge in new_edges:
|
||||
if new_edge.end() == end:
|
||||
agenda.append(new_edge)
|
||||
|
||||
return chart
|
||||
|
||||
|
||||
class EarleyChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
|
||||
|
||||
|
||||
class IncrementalTopDownChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class IncrementalBottomUpChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class IncrementalLeftCornerChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
if not grammar.is_nonempty():
|
||||
raise ValueError(
|
||||
"IncrementalLeftCornerParser only works for grammars "
|
||||
"without empty productions."
|
||||
)
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental FCFG Chart Parsers
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
EARLEY_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureTopDownInitRule(),
|
||||
FeatureCompleterRule(),
|
||||
FeatureScannerRule(),
|
||||
FeaturePredictorRule(),
|
||||
]
|
||||
TD_INCREMENTAL_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureTopDownInitRule(),
|
||||
FeatureTopDownPredictRule(),
|
||||
FeatureCompleteFundamentalRule(),
|
||||
]
|
||||
BU_INCREMENTAL_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictRule(),
|
||||
FeatureCompleteFundamentalRule(),
|
||||
]
|
||||
BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictCombineRule(),
|
||||
FeatureCompleteFundamentalRule(),
|
||||
]
|
||||
|
||||
|
||||
class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
|
||||
def __init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
|
||||
trace_chart_width=20,
|
||||
chart_class=FeatureIncrementalChart,
|
||||
**parser_args
|
||||
):
|
||||
IncrementalChartParser.__init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=strategy,
|
||||
trace_chart_width=trace_chart_width,
|
||||
chart_class=chart_class,
|
||||
**parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureEarleyChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Demonstration
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo(
|
||||
print_times=True,
|
||||
print_grammar=False,
|
||||
print_trees=True,
|
||||
trace=2,
|
||||
sent="I saw John with a dog with my cookie",
|
||||
numparses=5,
|
||||
):
|
||||
"""
|
||||
A demonstration of the Earley parsers.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk.parse.chart import demo_grammar
|
||||
|
||||
# The grammar for ChartParser and SteppingChartParser:
|
||||
grammar = demo_grammar()
|
||||
if print_grammar:
|
||||
print("* Grammar")
|
||||
print(grammar)
|
||||
|
||||
# Tokenize the sample sentence.
|
||||
print("* Sentence:")
|
||||
print(sent)
|
||||
tokens = sent.split()
|
||||
print(tokens)
|
||||
print()
|
||||
|
||||
# Do the parsing.
|
||||
earley = EarleyChartParser(grammar, trace=trace)
|
||||
t = perf_counter()
|
||||
chart = earley.chart_parse(tokens)
|
||||
parses = list(chart.parses(grammar.start()))
|
||||
t = perf_counter() - t
|
||||
|
||||
# Print results.
|
||||
if numparses:
|
||||
assert len(parses) == numparses, "Not all parses found"
|
||||
if print_trees:
|
||||
for tree in parses:
|
||||
print(tree)
|
||||
else:
|
||||
print("Nr trees:", len(parses))
|
||||
if print_times:
|
||||
print("Time:", t)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
129
backend/venv/Lib/site-packages/nltk/parse/evaluate.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# Natural Language Toolkit: evaluation of dependency parser
|
||||
#
|
||||
# Author: Long Duong <longdt219@gmail.com>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import unicodedata
|
||||
|
||||
|
||||
class DependencyEvaluator:
|
||||
"""
|
||||
Class for measuring labelled and unlabelled attachment score for
|
||||
dependency parsing. Note that the evaluation ignores punctuation.
|
||||
|
||||
>>> from nltk.parse import DependencyGraph, DependencyEvaluator
|
||||
|
||||
>>> gold_sent = DependencyGraph(\"""
|
||||
... Pierre NNP 2 NMOD
|
||||
... Vinken NNP 8 SUB
|
||||
... , , 2 P
|
||||
... 61 CD 5 NMOD
|
||||
... years NNS 6 AMOD
|
||||
... old JJ 2 NMOD
|
||||
... , , 2 P
|
||||
... will MD 0 ROOT
|
||||
... join VB 8 VC
|
||||
... the DT 11 NMOD
|
||||
... board NN 9 OBJ
|
||||
... as IN 9 VMOD
|
||||
... a DT 15 NMOD
|
||||
... nonexecutive JJ 15 NMOD
|
||||
... director NN 12 PMOD
|
||||
... Nov. NNP 9 VMOD
|
||||
... 29 CD 16 NMOD
|
||||
... . . 9 VMOD
|
||||
... \""")
|
||||
|
||||
>>> parsed_sent = DependencyGraph(\"""
|
||||
... Pierre NNP 8 NMOD
|
||||
... Vinken NNP 1 SUB
|
||||
... , , 3 P
|
||||
... 61 CD 6 NMOD
|
||||
... years NNS 6 AMOD
|
||||
... old JJ 2 NMOD
|
||||
... , , 3 AMOD
|
||||
... will MD 0 ROOT
|
||||
... join VB 8 VC
|
||||
... the DT 11 AMOD
|
||||
... board NN 9 OBJECT
|
||||
... as IN 9 NMOD
|
||||
... a DT 15 NMOD
|
||||
... nonexecutive JJ 15 NMOD
|
||||
... director NN 12 PMOD
|
||||
... Nov. NNP 9 VMOD
|
||||
... 29 CD 16 NMOD
|
||||
... . . 9 VMOD
|
||||
... \""")
|
||||
|
||||
>>> de = DependencyEvaluator([parsed_sent],[gold_sent])
|
||||
>>> las, uas = de.eval()
|
||||
>>> las
|
||||
0.6
|
||||
>>> uas
|
||||
0.8
|
||||
>>> abs(uas - 0.8) < 0.00001
|
||||
True
|
||||
"""
|
||||
|
||||
def __init__(self, parsed_sents, gold_sents):
|
||||
"""
|
||||
:param parsed_sents: the list of parsed sentences, as output by the parser
|
||||
:type parsed_sents: list(DependencyGraph)
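:param gold_sents: the corresponding gold-standard sentences
:type gold_sents: list(DependencyGraph)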
|
||||
"""
|
||||
self._parsed_sents = parsed_sents
|
||||
self._gold_sents = gold_sents
|
||||
|
||||
def _remove_punct(self, inStr):
|
||||
"""
|
||||
Remove punctuation from a Unicode string.
|
||||
:param inStr: the input string
|
||||
:return: Unicode string with all punctuation removed
|
||||
"""
|
||||
punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
|
||||
return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat)
|
||||
|
||||
def eval(self):
|
||||
"""
|
||||
Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS)
|
||||
|
||||
:return: tuple(float, float)
|
||||
"""
|
||||
if len(self._parsed_sents) != len(self._gold_sents):
|
||||
raise ValueError(
|
||||
" Number of parsed sentence is different with number of gold sentence."
|
||||
)
|
||||
|
||||
corr = 0
|
||||
corrL = 0
|
||||
total = 0
|
||||
|
||||
for i in range(len(self._parsed_sents)):
|
||||
parsed_sent_nodes = self._parsed_sents[i].nodes
|
||||
gold_sent_nodes = self._gold_sents[i].nodes
|
||||
|
||||
if len(parsed_sent_nodes) != len(gold_sent_nodes):
|
||||
raise ValueError("Sentences must have equal length.")
|
||||
|
||||
for parsed_node_address, parsed_node in parsed_sent_nodes.items():
|
||||
gold_node = gold_sent_nodes[parsed_node_address]
|
||||
|
||||
if parsed_node["word"] is None:
|
||||
continue
|
||||
if parsed_node["word"] != gold_node["word"]:
|
||||
raise ValueError("Sentence sequence is not matched.")
|
||||
|
||||
# Ignore if word is punctuation by default
|
||||
# if (parsed_sent[j]["word"] in string.punctuation):
|
||||
if self._remove_punct(parsed_node["word"]) == "":
|
||||
continue
|
||||
|
||||
total += 1
|
||||
if parsed_node["head"] == gold_node["head"]:
|
||||
corr += 1
|
||||
if parsed_node["rel"] == gold_node["rel"]:
|
||||
corrL += 1
|
||||
|
||||
return corrL / total, corr / total
|
||||
674
backend/venv/Lib/site-packages/nltk/parse/featurechart.py
Normal file
@@ -0,0 +1,674 @@
|
||||
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rob Speer <rspeer@mit.edu>
|
||||
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Extension of chart parsing implementation to handle grammars with
|
||||
feature structures as nodes.
|
||||
"""
|
||||
from time import perf_counter
|
||||
|
||||
from nltk.featstruct import TYPE, FeatStruct, find_variables, unify
|
||||
from nltk.grammar import (
|
||||
CFG,
|
||||
FeatStructNonterminal,
|
||||
Nonterminal,
|
||||
Production,
|
||||
is_nonterminal,
|
||||
is_terminal,
|
||||
)
|
||||
from nltk.parse.chart import (
|
||||
BottomUpPredictCombineRule,
|
||||
BottomUpPredictRule,
|
||||
CachedTopDownPredictRule,
|
||||
Chart,
|
||||
ChartParser,
|
||||
EdgeI,
|
||||
EmptyPredictRule,
|
||||
FundamentalRule,
|
||||
LeafInitRule,
|
||||
SingleEdgeFundamentalRule,
|
||||
TopDownInitRule,
|
||||
TreeEdge,
|
||||
)
|
||||
from nltk.sem import logic
|
||||
from nltk.tree import Tree
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Tree Edge
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureTreeEdge(TreeEdge):
|
||||
"""
|
||||
A specialized tree edge that allows shared variable bindings
|
||||
between nonterminals on the left-hand side and right-hand side.
|
||||
|
||||
Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
|
||||
dictionary mapping from variables to values. If the edge is not
|
||||
complete, then these bindings are simply stored. However, if the
|
||||
edge is complete, then the constructor applies these bindings to
|
||||
every nonterminal in the edge whose symbol implements the
|
||||
interface ``SubstituteBindingsI``.
|
||||
"""
|
||||
|
||||
def __init__(self, span, lhs, rhs, dot=0, bindings=None):
|
||||
"""
|
||||
Construct a new edge. If the edge is incomplete (i.e., if
|
||||
``dot<len(rhs)``), then store the bindings as-is. If the edge
|
||||
is complete (i.e., if ``dot==len(rhs)``), then apply the
|
||||
bindings to all nonterminals in ``lhs`` and ``rhs``, and then
|
||||
clear the bindings. See ``TreeEdge`` for a description of
|
||||
the other arguments.
|
||||
"""
|
||||
if bindings is None:
|
||||
bindings = {}
|
||||
|
||||
# If the edge is complete, then substitute in the bindings,
|
||||
# and then throw them away. (If we didn't throw them away, we
|
||||
# might think that 2 complete edges are different just because
|
||||
# they have different bindings, even though all bindings have
|
||||
# already been applied.)
|
||||
if dot == len(rhs) and bindings:
|
||||
lhs = self._bind(lhs, bindings)
|
||||
rhs = [self._bind(elt, bindings) for elt in rhs]
|
||||
bindings = {}
|
||||
|
||||
# Initialize the edge.
|
||||
TreeEdge.__init__(self, span, lhs, rhs, dot)
|
||||
self._bindings = bindings
|
||||
self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))
|
||||
|
||||
@staticmethod
|
||||
def from_production(production, index):
|
||||
"""
|
||||
:return: A new ``TreeEdge`` formed from the given production.
|
||||
The new edge's left-hand side and right-hand side will
|
||||
be taken from ``production``; its span will be
|
||||
``(index,index)``; and its dot position will be ``0``.
|
||||
:rtype: TreeEdge
|
||||
"""
|
||||
return FeatureTreeEdge(
|
||||
span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
|
||||
)
|
||||
|
||||
def move_dot_forward(self, new_end, bindings=None):
|
||||
"""
|
||||
:return: A new ``FeatureTreeEdge`` formed from this edge.
|
||||
The new edge's dot position is increased by ``1``,
|
||||
and its end index will be replaced by ``new_end``.
|
||||
:rtype: FeatureTreeEdge
|
||||
:param new_end: The new end index.
|
||||
:type new_end: int
|
||||
:param bindings: Bindings for the new edge.
|
||||
:type bindings: dict
|
||||
"""
|
||||
return FeatureTreeEdge(
|
||||
span=(self._span[0], new_end),
|
||||
lhs=self._lhs,
|
||||
rhs=self._rhs,
|
||||
dot=self._dot + 1,
|
||||
bindings=bindings,
|
||||
)
|
||||
|
||||
def _bind(self, nt, bindings):
|
||||
if not isinstance(nt, FeatStructNonterminal):
|
||||
return nt
|
||||
return nt.substitute_bindings(bindings)
|
||||
|
||||
def next_with_bindings(self):
|
||||
return self._bind(self.nextsym(), self._bindings)
|
||||
|
||||
def bindings(self):
|
||||
"""
|
||||
Return a copy of this edge's bindings dictionary.
|
||||
"""
|
||||
return self._bindings.copy()
|
||||
|
||||
def variables(self):
|
||||
"""
|
||||
:return: The set of variables used by this edge.
|
||||
:rtype: set(Variable)
|
||||
"""
|
||||
return find_variables(
|
||||
[self._lhs]
|
||||
+ list(self._rhs)
|
||||
+ list(self._bindings.keys())
|
||||
+ list(self._bindings.values()),
|
||||
fs_class=FeatStruct,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
if self.is_complete():
|
||||
return super().__str__()
|
||||
else:
|
||||
bindings = "{%s}" % ", ".join(
|
||||
"%s: %r" % item for item in sorted(self._bindings.items())
|
||||
)
|
||||
return f"{super().__str__()} {bindings}"
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# A specialized Chart for feature grammars
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
# TODO: subsumes check when adding new edges
|
||||
|
||||
|
||||
class FeatureChart(Chart):
|
||||
"""
|
||||
A Chart for feature grammars.
|
||||
:see: ``Chart`` for more information.
|
||||
"""
|
||||
|
||||
def select(self, **restrictions):
|
||||
"""
|
||||
Returns an iterator over the edges in this chart.
|
||||
See ``Chart.select`` for more information about the
|
||||
``restrictions`` on the edges.
|
||||
"""
|
||||
# If there are no restrictions, then return all edges.
|
||||
if restrictions == {}:
|
||||
return iter(self._edges)
|
||||
|
||||
# Find the index corresponding to the given restrictions.
|
||||
restr_keys = sorted(restrictions.keys())
|
||||
restr_keys = tuple(restr_keys)
|
||||
|
||||
# If it doesn't exist, then create it.
|
||||
if restr_keys not in self._indexes:
|
||||
self._add_index(restr_keys)
|
||||
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(restrictions[key]) for key in restr_keys
|
||||
)
|
||||
return iter(self._indexes[restr_keys].get(vals, []))
|
||||
|
||||
def _add_index(self, restr_keys):
|
||||
"""
|
||||
A helper function for ``select``, which creates a new index for
|
||||
a given set of attributes (aka restriction keys).
|
||||
"""
|
||||
# Make sure it's a valid index.
|
||||
for key in restr_keys:
|
||||
if not hasattr(EdgeI, key):
|
||||
raise ValueError("Bad restriction: %s" % key)
|
||||
|
||||
# Create the index.
|
||||
index = self._indexes[restr_keys] = {}
|
||||
|
||||
# Add all existing edges to the index.
|
||||
for edge in self._edges:
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
|
||||
)
|
||||
index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _register_with_indexes(self, edge):
|
||||
"""
|
||||
A helper function for ``insert``, which registers the new
|
||||
edge with all existing indexes.
|
||||
"""
|
||||
for restr_keys, index in self._indexes.items():
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
|
||||
)
|
||||
index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _get_type_if_possible(self, item):
|
||||
"""
|
||||
Helper function which returns the ``TYPE`` feature of the ``item``,
|
||||
if it exists, otherwise it returns the ``item`` itself
|
||||
"""
|
||||
if isinstance(item, dict) and TYPE in item:
|
||||
return item[TYPE]
|
||||
else:
|
||||
return item
|
||||
|
||||
def parses(self, start, tree_class=Tree):
|
||||
for edge in self.select(start=0, end=self._num_leaves):
|
||||
if (
|
||||
(isinstance(edge, FeatureTreeEdge))
|
||||
and (edge.lhs()[TYPE] == start[TYPE])
|
||||
and (unify(edge.lhs(), start, rename_vars=True))
|
||||
):
|
||||
yield from self.trees(edge, complete=True, tree_class=tree_class)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Fundamental Rule
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureFundamentalRule(FundamentalRule):
|
||||
r"""
|
||||
A specialized version of the fundamental rule that operates on
|
||||
nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
|
||||
than simply comparing the nonterminals for equality, they are
|
||||
unified. Variable bindings from these unifications are collected
|
||||
and stored in the chart using a ``FeatureTreeEdge``. When a
|
||||
complete edge is generated, these bindings are applied to all
|
||||
nonterminals in the edge.
|
||||
|
||||
The fundamental rule states that:
|
||||
|
||||
- ``[A -> alpha \* B1 beta][i:j]``
|
||||
- ``[B2 -> gamma \*][j:k]``
|
||||
|
||||
licenses the edge:
|
||||
|
||||
- ``[A -> alpha B3 \* beta][i:k]``
|
||||
|
||||
assuming that B1 and B2 can be unified to generate B3.
|
||||
"""
|
||||
|
||||
def apply(self, chart, grammar, left_edge, right_edge):
|
||||
# Make sure the rule is applicable.
|
||||
if not (
|
||||
left_edge.end() == right_edge.start()
|
||||
and left_edge.is_incomplete()
|
||||
and right_edge.is_complete()
|
||||
and isinstance(left_edge, FeatureTreeEdge)
|
||||
):
|
||||
return
|
||||
found = right_edge.lhs()
|
||||
nextsym = left_edge.nextsym()
|
||||
if isinstance(right_edge, FeatureTreeEdge):
|
||||
if not is_nonterminal(nextsym):
|
||||
return
|
||||
if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
|
||||
return
|
||||
# Create a copy of the bindings.
|
||||
bindings = left_edge.bindings()
|
||||
# We rename vars here, because we don't want variables
|
||||
# from the two different productions to match.
|
||||
found = found.rename_variables(used_vars=left_edge.variables())
|
||||
# Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
|
||||
# generate B3 (result).
|
||||
result = unify(nextsym, found, bindings, rename_vars=False)
|
||||
if result is None:
|
||||
return
|
||||
else:
|
||||
if nextsym != found:
|
||||
return
|
||||
# Create a copy of the bindings.
|
||||
bindings = left_edge.bindings()
|
||||
|
||||
# Construct the new edge.
|
||||
new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)
|
||||
|
||||
# Add it to the chart, with appropriate child pointers.
|
||||
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
|
||||
"""
|
||||
A specialized version of the completer / single edge fundamental rule
|
||||
that operates on nonterminals whose symbols are ``FeatStructNonterminal``.
|
||||
Rather than simply comparing the nonterminals for equality, they are
|
||||
unified.
|
||||
"""
|
||||
|
||||
_fundamental_rule = FeatureFundamentalRule()
|
||||
|
||||
def _apply_complete(self, chart, grammar, right_edge):
|
||||
fr = self._fundamental_rule
|
||||
for left_edge in chart.select(
|
||||
end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, left_edge, right_edge)
|
||||
|
||||
def _apply_incomplete(self, chart, grammar, left_edge):
|
||||
fr = self._fundamental_rule
|
||||
for right_edge in chart.select(
|
||||
start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, left_edge, right_edge)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Top-Down Prediction
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureTopDownInitRule(TopDownInitRule):
|
||||
def apply(self, chart, grammar):
|
||||
for prod in grammar.productions(lhs=grammar.start()):
|
||||
new_edge = FeatureTreeEdge.from_production(prod, 0)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureTopDownPredictRule(CachedTopDownPredictRule):
|
||||
r"""
|
||||
A specialized version of the (cached) top down predict rule that operates
|
||||
on nonterminals whose symbols are ``FeatStructNonterminal``. Rather
|
||||
than simply comparing the nonterminals for equality, they are
|
||||
unified.
|
||||
|
||||
The top down expand rule states that:
|
||||
|
||||
- ``[A -> alpha \* B1 beta][i:j]``
|
||||
|
||||
licenses the edge:
|
||||
|
||||
- ``[B2 -> \* gamma][j:j]``
|
||||
|
||||
for each grammar production ``B2 -> gamma``, assuming that B1
|
||||
and B2 can be unified.
|
||||
"""
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_complete():
|
||||
return
|
||||
nextsym, index = edge.nextsym(), edge.end()
|
||||
if not is_nonterminal(nextsym):
|
||||
return
|
||||
|
||||
# If we've already applied this rule to an edge with the same
|
||||
# next & end, and the chart & grammar have not changed, then
|
||||
# just return (no new edges to add).
|
||||
nextsym_with_bindings = edge.next_with_bindings()
|
||||
done = self._done.get((nextsym_with_bindings, index), (None, None))
|
||||
if done[0] is chart and done[1] is grammar:
|
||||
return
|
||||
|
||||
for prod in grammar.productions(lhs=nextsym):
|
||||
# If the left corner in the predicted production is
|
||||
# leaf, it must match with the input.
|
||||
if prod.rhs():
|
||||
first = prod.rhs()[0]
|
||||
if is_terminal(first):
|
||||
if index >= chart.num_leaves():
|
||||
continue
|
||||
if first != chart.leaf(index):
|
||||
continue
|
||||
|
||||
# We rename vars here, because we don't want variables
|
||||
# from the two different productions to match.
|
||||
if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
|
||||
new_edge = FeatureTreeEdge.from_production(prod, edge.end())
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
# Record the fact that we've applied this rule.
|
||||
self._done[nextsym_with_bindings, index] = (chart, grammar)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Bottom-Up Prediction
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureBottomUpPredictRule(BottomUpPredictRule):
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_incomplete():
|
||||
return
|
||||
for prod in grammar.productions(rhs=edge.lhs()):
|
||||
if isinstance(edge, FeatureTreeEdge):
|
||||
_next = prod.rhs()[0]
|
||||
if not is_nonterminal(_next):
|
||||
continue
|
||||
|
||||
new_edge = FeatureTreeEdge.from_production(prod, edge.start())
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_incomplete():
|
||||
return
|
||||
found = edge.lhs()
|
||||
for prod in grammar.productions(rhs=found):
|
||||
bindings = {}
|
||||
if isinstance(edge, FeatureTreeEdge):
|
||||
_next = prod.rhs()[0]
|
||||
if not is_nonterminal(_next):
|
||||
continue
|
||||
|
||||
# We rename vars here, because we don't want variables
|
||||
# from the two different productions to match.
|
||||
used_vars = find_variables(
|
||||
(prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
|
||||
)
|
||||
found = found.rename_variables(used_vars=used_vars)
|
||||
|
||||
result = unify(_next, found, bindings, rename_vars=False)
|
||||
if result is None:
|
||||
continue
|
||||
|
||||
new_edge = FeatureTreeEdge.from_production(
|
||||
prod, edge.start()
|
||||
).move_dot_forward(edge.end(), bindings)
|
||||
if chart.insert(new_edge, (edge,)):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureEmptyPredictRule(EmptyPredictRule):
|
||||
def apply(self, chart, grammar):
|
||||
for prod in grammar.productions(empty=True):
|
||||
for index in range(chart.num_leaves() + 1):
|
||||
new_edge = FeatureTreeEdge.from_production(prod, index)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Feature Chart Parser
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
TD_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureTopDownInitRule(),
|
||||
FeatureTopDownPredictRule(),
|
||||
FeatureSingleEdgeFundamentalRule(),
|
||||
]
|
||||
BU_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictRule(),
|
||||
FeatureSingleEdgeFundamentalRule(),
|
||||
]
|
||||
BU_LC_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictCombineRule(),
|
||||
FeatureSingleEdgeFundamentalRule(),
|
||||
]
|
||||
|
||||
|
||||
class FeatureChartParser(ChartParser):
|
||||
def __init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=BU_LC_FEATURE_STRATEGY,
|
||||
trace_chart_width=20,
|
||||
chart_class=FeatureChart,
|
||||
**parser_args,
|
||||
):
|
||||
ChartParser.__init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=strategy,
|
||||
trace_chart_width=trace_chart_width,
|
||||
chart_class=chart_class,
|
||||
**parser_args,
|
||||
)
|
||||
|
||||
|
||||
class FeatureTopDownChartParser(FeatureChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
|
||||
|
||||
|
||||
class FeatureBottomUpChartParser(FeatureChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
|
||||
|
||||
|
||||
class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureChartParser.__init__(
|
||||
self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Instantiate Variable Chart
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class InstantiateVarsChart(FeatureChart):
|
||||
"""
|
||||
A specialized chart that 'instantiates' variables whose names
|
||||
start with '@', by replacing them with unique new variables.
|
||||
In particular, whenever a complete edge is added to the chart, any
|
||||
variables in the edge's ``lhs`` whose names start with '@' will be
|
||||
replaced by unique new ``Variable``.
|
||||
"""
|
||||
|
||||
def __init__(self, tokens):
|
||||
FeatureChart.__init__(self, tokens)
|
||||
|
||||
def initialize(self):
|
||||
self._instantiated = set()
|
||||
FeatureChart.initialize(self)
|
||||
|
||||
def insert(self, edge, child_pointer_list):
|
||||
if edge in self._instantiated:
|
||||
return False
|
||||
self.instantiate_edge(edge)
|
||||
return FeatureChart.insert(self, edge, child_pointer_list)
|
||||
|
||||
def instantiate_edge(self, edge):
|
||||
"""
|
||||
If the edge is a ``FeatureTreeEdge``, and it is complete,
|
||||
then instantiate all variables whose names start with '@',
|
||||
by replacing them with unique new variables.
|
||||
|
||||
Note that instantiation is done in-place, since the
|
||||
parsing algorithms might already hold a reference to
|
||||
the edge for future use.
|
||||
"""
|
||||
# If the edge is a leaf, or is not complete, or is
|
||||
# already in the chart, then just return it as-is.
|
||||
if not isinstance(edge, FeatureTreeEdge):
|
||||
return
|
||||
if not edge.is_complete():
|
||||
return
|
||||
if edge in self._edge_to_cpls:
|
||||
return
|
||||
|
||||
# Get a list of variables that need to be instantiated.
|
||||
# If there are none, then return as-is.
|
||||
inst_vars = self.inst_vars(edge)
|
||||
if not inst_vars:
|
||||
return
|
||||
|
||||
# Instantiate the edge!
|
||||
self._instantiated.add(edge)
|
||||
edge._lhs = edge.lhs().substitute_bindings(inst_vars)
|
||||
|
||||
def inst_vars(self, edge):
|
||||
return {
|
||||
var: logic.unique_variable()
|
||||
for var in edge.lhs().variables()
|
||||
if var.name.startswith("@")
|
||||
}
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Demo
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo_grammar():
|
||||
from nltk.grammar import FeatureGrammar
|
||||
|
||||
return FeatureGrammar.fromstring(
|
||||
"""
|
||||
S -> NP VP
|
||||
PP -> Prep NP
|
||||
NP -> NP PP
|
||||
VP -> VP PP
|
||||
VP -> Verb NP
|
||||
VP -> Verb
|
||||
NP -> Det[pl=?x] Noun[pl=?x]
|
||||
NP -> "John"
|
||||
NP -> "I"
|
||||
Det -> "the"
|
||||
Det -> "my"
|
||||
Det[-pl] -> "a"
|
||||
Noun[-pl] -> "dog"
|
||||
Noun[-pl] -> "cookie"
|
||||
Verb -> "ate"
|
||||
Verb -> "saw"
|
||||
Prep -> "with"
|
||||
Prep -> "under"
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def demo(
|
||||
print_times=True,
|
||||
print_grammar=True,
|
||||
print_trees=True,
|
||||
print_sentence=True,
|
||||
trace=1,
|
||||
parser=FeatureChartParser,
|
||||
sent="I saw John with a dog with my cookie",
|
||||
):
|
||||
import sys
|
||||
import time
|
||||
|
||||
print()
|
||||
grammar = demo_grammar()
|
||||
if print_grammar:
|
||||
print(grammar)
|
||||
print()
|
||||
print("*", parser.__name__)
|
||||
if print_sentence:
|
||||
print("Sentence:", sent)
|
||||
tokens = sent.split()
|
||||
t = perf_counter()
|
||||
cp = parser(grammar, trace=trace)
|
||||
chart = cp.chart_parse(tokens)
|
||||
trees = list(chart.parses(grammar.start()))
|
||||
if print_times:
|
||||
print("Time: %s" % (perf_counter() - t))
|
||||
if print_trees:
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
else:
|
||||
print("Nr trees:", len(trees))
|
||||
|
||||
|
||||
def run_profile():
|
||||
import profile
|
||||
|
||||
profile.run("for i in range(1): demo()", "/tmp/profile.out")
|
||||
import pstats
|
||||
|
||||
p = pstats.Stats("/tmp/profile.out")
|
||||
p.strip_dirs().sort_stats("time", "cum").print_stats(60)
|
||||
p.strip_dirs().sort_stats("cum", "time").print_stats(60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from nltk.data import load
|
||||
|
||||
demo()
|
||||
print()
|
||||
grammar = load("grammars/book_grammars/feat0.fcfg")
|
||||
cp = FeatureChartParser(grammar, trace=2)
|
||||
sent = "Kim likes children"
|
||||
tokens = sent.split()
|
||||
trees = cp.parse(tokens)
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
88
backend/venv/Lib/site-packages/nltk/parse/generate.py
Normal file
@@ -0,0 +1,88 @@
|
||||
# Natural Language Toolkit: Generating from a CFG
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||||
# Eric Kafe <kafe.eric@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
import itertools
|
||||
import sys
|
||||
|
||||
from nltk.grammar import Nonterminal
|
||||
|
||||
|
||||
def generate(grammar, start=None, depth=None, n=None):
|
||||
"""
|
||||
Generates an iterator of all sentences from a CFG.
|
||||
|
||||
:param grammar: The Grammar used to generate sentences.
|
||||
:param start: The Nonterminal from which to start generating sentences.
|
||||
:param depth: The maximal depth of the generated tree.
|
||||
:param n: The maximum number of sentences to return.
|
||||
:return: An iterator of lists of terminal tokens.
|
||||
"""
|
||||
if not start:
|
||||
start = grammar.start()
|
||||
if depth is None:
|
||||
# Safe default, assuming the grammar may be recursive:
|
||||
depth = (sys.getrecursionlimit() // 3) - 3
|
||||
|
||||
iter = _generate_all(grammar, [start], depth)
|
||||
|
||||
if n:
|
||||
iter = itertools.islice(iter, n)
|
||||
|
||||
return iter
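# Minimal usage sketch (illustrative; see demo() below for a fuller example):
#
#     >>> from nltk.grammar import CFG
#     >>> g = CFG.fromstring("S -> 'a' S | 'b'")
#     >>> list(generate(g, depth=3))
#     [['a', 'b'], ['b']]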
|
||||
|
||||
|
||||
def _generate_all(grammar, items, depth):
|
||||
if items:
|
||||
try:
|
||||
for frag1 in _generate_one(grammar, items[0], depth):
|
||||
for frag2 in _generate_all(grammar, items[1:], depth):
|
||||
yield frag1 + frag2
|
||||
except RecursionError as error:
|
||||
# Helpful error message while still showing the recursion stack.
|
||||
raise RuntimeError(
|
||||
"The grammar has rule(s) that yield infinite recursion!\n\
|
||||
Eventually use a lower 'depth', or a higher 'sys.setrecursionlimit()'."
|
||||
) from error
|
||||
else:
|
||||
yield []
|
||||
|
||||
|
||||
def _generate_one(grammar, item, depth):
|
||||
if depth > 0:
|
||||
if isinstance(item, Nonterminal):
|
||||
for prod in grammar.productions(lhs=item):
|
||||
yield from _generate_all(grammar, prod.rhs(), depth - 1)
|
||||
else:
|
||||
yield [item]
|
||||
|
||||
|
||||
demo_grammar = """
|
||||
S -> NP VP
|
||||
NP -> Det N
|
||||
PP -> P NP
|
||||
VP -> 'slept' | 'saw' NP | 'walked' PP
|
||||
Det -> 'the' | 'a'
|
||||
N -> 'man' | 'park' | 'dog'
|
||||
P -> 'in' | 'with'
|
||||
"""
|
||||
|
||||
|
||||
def demo(N=23):
|
||||
from nltk.grammar import CFG
|
||||
|
||||
print("Generating the first %d sentences for demo grammar:" % (N,))
|
||||
print(demo_grammar)
|
||||
grammar = CFG.fromstring(demo_grammar)
|
||||
for n, sent in enumerate(generate(grammar, n=N), 1):
|
||||
print("%3d. %s" % (n, " ".join(sent)))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
393
backend/venv/Lib/site-packages/nltk/parse/malt.py
Normal file
@@ -0,0 +1,393 @@
|
||||
# Natural Language Toolkit: Interface to MaltParser
|
||||
#
|
||||
# Author: Dan Garrette <dhgarrette@gmail.com>
|
||||
# Contributor: Liling Tan, Mustufain, osamamukhtar11
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from nltk.data import ZipFilePathPointer
|
||||
from nltk.internals import find_dir, find_file, find_jars_within_path
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
from nltk.parse.util import taggedsents_to_conll
|
||||
|
||||
|
||||
def malt_regex_tagger():
|
||||
from nltk.tag import RegexpTagger
|
||||
|
||||
_tagger = RegexpTagger(
|
||||
[
|
||||
(r"\.$", "."),
|
||||
(r"\,$", ","),
|
||||
(r"\?$", "?"), # fullstop, comma, Qmark
|
||||
(r"\($", "("),
|
||||
(r"\)$", ")"), # round brackets
|
||||
(r"\[$", "["),
|
||||
(r"\]$", "]"), # square brackets
|
||||
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
|
||||
(r"(The|the|A|a|An|an)$", "DT"), # articles
|
||||
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
|
||||
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
|
||||
(r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive
|
||||
(r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
|
||||
(r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
|
||||
(r"(till|Till|until|Until)$", "IN"), # time prepopsitions
|
||||
(r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
|
||||
(r"(under|Under|below|Below)$", "IN"), # space prepopsitions
|
||||
(r"(over|Over|above|Above)$", "IN"), # space prepopsitions
|
||||
(r"(across|Across|through|Through)$", "IN"), # space prepopsitions
|
||||
(r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
|
||||
(r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
|
||||
(r".*able$", "JJ"), # adjectives
|
||||
(r".*ness$", "NN"), # nouns formed from adjectives
|
||||
(r".*ly$", "RB"), # adverbs
|
||||
(r".*s$", "NNS"), # plural nouns
|
||||
(r".*ing$", "VBG"), # gerunds
|
||||
(r".*ed$", "VBD"), # past tense verbs
|
||||
(r".*", "NN"), # nouns (default)
|
||||
]
|
||||
)
|
||||
return _tagger.tag
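# Illustrative sketch (not part of the original module): the returned callable
# behaves like nltk.pos_tag, mapping a token list to (word, tag) pairs.
#
#     >>> tag = malt_regex_tagger()
#     >>> tag("The dog walked quickly .".split())
#     [('The', 'DT'), ('dog', 'NN'), ('walked', 'VBD'), ('quickly', 'RB'), ('.', '.')]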
|
||||
|
||||
|
||||
def find_maltparser(parser_dirname):
|
||||
"""
|
||||
A function to find the MaltParser .jar file and its dependencies.
|
||||
"""
|
||||
if os.path.exists(parser_dirname): # If a full path is given.
|
||||
_malt_dir = parser_dirname
|
||||
else: # Try to find path to maltparser directory in environment variables.
|
||||
_malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
|
||||
# Check that the found directory contains all the necessary .jar files.
|
||||
malt_dependencies = ["", "", ""]
|
||||
_malt_jars = set(find_jars_within_path(_malt_dir))
|
||||
_jars = {os.path.split(jar)[1] for jar in _malt_jars}
|
||||
malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}
|
||||
|
||||
assert malt_dependencies.issubset(_jars)
|
||||
assert any(
|
||||
filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
|
||||
)
|
||||
return list(_malt_jars)
|
||||
|
||||
|
||||
def find_malt_model(model_filename):
|
||||
"""
|
||||
A function to find a pre-trained MaltParser model.
|
||||
"""
|
||||
if model_filename is None:
|
||||
return "malt_temp.mco"
|
||||
elif os.path.exists(model_filename): # If a full path is given.
|
||||
return model_filename
|
||||
else: # Try to find path to malt model in environment variables.
|
||||
return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
|
||||
|
||||
|
||||
class MaltParser(ParserI):
|
||||
"""
|
||||
A class for dependency parsing with MaltParser. The input is the paths to:
|
||||
- (optionally) a maltparser directory
|
||||
- (optionally) the path to a pre-trained MaltParser .mco model file
|
||||
- (optionally) the tagger to use for POS tagging before parsing
|
||||
- (optionally) additional Java arguments
|
||||
|
||||
Example:
|
||||
>>> from nltk.parse import malt
|
||||
>>> # With MALT_PARSER and MALT_MODEL environment set.
|
||||
>>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
|
||||
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
>>> # Without MALT_PARSER and MALT_MODEL environment.
|
||||
>>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
|
||||
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
parser_dirname="",
|
||||
model_filename=None,
|
||||
tagger=None,
|
||||
additional_java_args=None,
|
||||
):
|
||||
"""
|
||||
An interface for parsing with the Malt Parser.
|
||||
|
||||
:param parser_dirname: The path to the maltparser directory that
|
||||
contains the maltparser-1.x.jar
|
||||
:type parser_dirname: str
|
||||
:param model_filename: The name of the pre-trained model with .mco file
|
||||
extension. If provided, training will not be required.
|
||||
(see http://www.maltparser.org/mco/mco.html and
|
||||
see http://www.patful.com/chalk/node/185)
|
||||
:type model_filename: str
|
||||
:param tagger: The tagger used to POS tag the raw string before
|
||||
formatting to CONLL format. It should behave like `nltk.pos_tag`
|
||||
:type tagger: function
|
||||
:param additional_java_args: These are the additional Java arguments that
one can use when calling MaltParser, usually the heap size
limits, e.g. `additional_java_args=['-Xmx1024m']`
|
||||
(see https://javarevisited.blogspot.com/2011/05/java-heap-space-memory-size-jvm.html)
|
||||
:type additional_java_args: list
|
||||
"""
|
||||
|
||||
# Find all the necessary jar files for MaltParser.
|
||||
self.malt_jars = find_maltparser(parser_dirname)
|
||||
# Initialize additional java arguments.
|
||||
self.additional_java_args = (
|
||||
additional_java_args if additional_java_args is not None else []
|
||||
)
|
||||
# Initialize model.
|
||||
self.model = find_malt_model(model_filename)
|
||||
self._trained = self.model != "malt_temp.mco"
|
||||
# Set the working_dir parameters i.e. `-w` from MaltParser's option.
|
||||
self.working_dir = tempfile.gettempdir()
|
||||
# Initialize POS tagger.
|
||||
self.tagger = tagger if tagger is not None else malt_regex_tagger()
|
||||
|
||||
def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
|
||||
"""
|
||||
Use MaltParser to parse multiple POS tagged sentences. Takes multiple
|
||||
sentences where each sentence is a list of (word, tag) tuples.
|
||||
The sentences must have already been tokenized and tagged.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(tuple(str, str)))
|
||||
:return: iter(iter(``DependencyGraph``)) the dependency graph
|
||||
representation of each sentence
|
||||
"""
|
||||
if not self._trained:
|
||||
raise Exception("Parser has not been trained. Call train() first.")
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_output.conll.",
|
||||
dir=self.working_dir,
|
||||
mode="w",
|
||||
delete=False,
|
||||
) as output_file:
|
||||
# Convert list of sentences to CONLL format.
|
||||
for line in taggedsents_to_conll(sentences):
|
||||
input_file.write(str(line))
|
||||
input_file.close()
|
||||
|
||||
# Generate command to run maltparser.
|
||||
cmd = self.generate_malt_command(
|
||||
input_file.name, output_file.name, mode="parse"
|
||||
)
|
||||
|
||||
# This is a MaltParser quirk: it needs to be run from the directory
# where the model file is; otherwise it runs into awkward
# missing-.jar or strange -w working_dir problems.
|
||||
_current_path = os.getcwd() # Remembers the current path.
|
||||
try: # Change to modelfile path
|
||||
os.chdir(os.path.split(self.model)[0])
|
||||
except:
|
||||
pass
|
||||
ret = self._execute(cmd, verbose) # Run command.
|
||||
os.chdir(_current_path) # Change back to current path.
|
||||
|
||||
if ret != 0:
|
||||
raise Exception(
|
||||
"MaltParser parsing (%s) failed with exit "
|
||||
"code %d" % (" ".join(cmd), ret)
|
||||
)
|
||||
|
||||
# Must return iter(iter(Tree))
|
||||
with open(output_file.name) as infile:
|
||||
for tree_str in infile.read().split("\n\n"):
|
||||
yield (
|
||||
iter(
|
||||
[
|
||||
DependencyGraph(
|
||||
tree_str, top_relation_label=top_relation_label
|
||||
)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
os.remove(input_file.name)
|
||||
os.remove(output_file.name)
|
||||
|
||||
def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
|
||||
"""
|
||||
Use MaltParser to parse multiple sentences.
|
||||
Takes a list of sentences, where each sentence is a list of words.
|
||||
Each sentence will be automatically tagged with this
|
||||
MaltParser instance's tagger.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(str))
|
||||
:return: iter(DependencyGraph)
|
||||
"""
|
||||
tagged_sentences = (self.tagger(sentence) for sentence in sentences)
|
||||
return self.parse_tagged_sents(
|
||||
tagged_sentences, verbose, top_relation_label=top_relation_label
|
||||
)
|
||||
|
||||
def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
|
||||
"""
|
||||
This function generates the maltparser command used at the terminal.
|
||||
|
||||
:param inputfilename: path to the input file
|
||||
:type inputfilename: str
|
||||
:param outputfilename: path to the output file
|
||||
:type outputfilename: str
|
||||
"""
|
||||
|
||||
cmd = ["java"]
|
||||
cmd += self.additional_java_args # Adds additional java arguments
|
||||
# Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
|
||||
classpaths_separator = ";" if sys.platform.startswith("win") else ":"
|
||||
cmd += [
|
||||
"-cp",
|
||||
classpaths_separator.join(self.malt_jars),
|
||||
] # Adds classpaths for jars
|
||||
cmd += ["org.maltparser.Malt"] # Adds the main function.
|
||||
|
||||
# Adds the model file.
|
||||
if os.path.exists(self.model): # when parsing
|
||||
cmd += ["-c", os.path.split(self.model)[-1]]
|
||||
else: # when learning
|
||||
cmd += ["-c", self.model]
|
||||
|
||||
cmd += ["-i", inputfilename]
|
||||
if mode == "parse":
|
||||
cmd += ["-o", outputfilename]
|
||||
cmd += ["-m", mode] # mode use to generate parses.
|
||||
return cmd
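# Illustrative sketch (assumed file names, not part of the original module):
# for a parse run the generated command has roughly this shape, joining the
# classpath with ';' on Windows and ':' elsewhere:
#
#     java -cp maltparser-1.9.2.jar:liblinear-1.8.jar:... org.maltparser.Malt \
#          -c engmalt.linear-1.7.mco -i /tmp/malt_input.conll.XXX -o /tmp/malt_output.conll.XXX -m parse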
|
||||
|
||||
@staticmethod
|
||||
def _execute(cmd, verbose=False):
|
||||
output = None if verbose else subprocess.PIPE
|
||||
p = subprocess.Popen(cmd, stdout=output, stderr=output)
|
||||
return p.wait()
|
||||
|
||||
def train(self, depgraphs, verbose=False):
|
||||
"""
|
||||
Train MaltParser from a list of ``DependencyGraph`` objects
|
||||
|
||||
:param depgraphs: list of ``DependencyGraph`` objects for training input data
|
||||
:type depgraphs: DependencyGraph
|
||||
"""
|
||||
|
||||
# Write the conll_str to malt_train.conll file in /tmp/
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
|
||||
input_file.write(str(input_str))
|
||||
# Trains the model with the malt_train.conll
|
||||
self.train_from_file(input_file.name, verbose=verbose)
|
||||
# Removes the malt_train.conll once training finishes.
|
||||
os.remove(input_file.name)
|
||||
|
||||
def train_from_file(self, conll_file, verbose=False):
|
||||
"""
|
||||
Train MaltParser from a file
|
||||
:param conll_file: str for the filename of the training input data
|
||||
:type conll_file: str
|
||||
"""
|
||||
|
||||
# If conll_file is a ZipFilePathPointer,
|
||||
# then we need to do some extra massaging
|
||||
if isinstance(conll_file, ZipFilePathPointer):
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
with conll_file.open() as conll_input_file:
|
||||
conll_str = conll_input_file.read()
|
||||
input_file.write(str(conll_str))
|
||||
return self.train_from_file(input_file.name, verbose=verbose)
|
||||
|
||||
# Generate command to run maltparser.
|
||||
cmd = self.generate_malt_command(conll_file, mode="learn")
|
||||
ret = self._execute(cmd, verbose)
|
||||
if ret != 0:
|
||||
raise Exception(
|
||||
"MaltParser training (%s) failed with exit "
|
||||
"code %d" % (" ".join(cmd), ret)
|
||||
)
|
||||
self._trained = True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
A demonstration function to show how NLTK users can use the malt parser API.
|
||||
|
||||
>>> from nltk import pos_tag
|
||||
>>> assert 'MALT_PARSER' in os.environ, str(
|
||||
... "Please set MALT_PARSER in your global environment, e.g.:\n"
|
||||
... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
|
||||
>>>
|
||||
>>> assert 'MALT_MODEL' in os.environ, str(
|
||||
... "Please set MALT_MODEL in your global environment, e.g.:\n"
|
||||
... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
|
||||
>>>
|
||||
>>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
|
||||
... "2 sees _ VB _ _ 0 ROOT _ _\n"
|
||||
... "3 a _ DT _ _ 4 SPEC _ _\n"
|
||||
... "4 dog _ NN _ _ 2 OBJ _ _\n"
|
||||
... "5 . _ . _ _ 2 PUNCT _ _\n")
|
||||
>>>
|
||||
>>>
|
||||
>>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
|
||||
... "2 walks _ VB _ _ 0 ROOT _ _\n"
|
||||
... "3 . _ . _ _ 2 PUNCT _ _\n")
|
||||
>>> dg1 = DependencyGraph(_dg1_str)
|
||||
>>> dg2 = DependencyGraph(_dg2_str)
|
||||
>>> # Initialize a MaltParser object
|
||||
>>> mp = MaltParser()
|
||||
>>>
|
||||
>>> # Trains a model.
|
||||
>>> mp.train([dg1,dg2], verbose=False)
|
||||
>>> sent1 = ['John','sees','Mary', '.']
|
||||
>>> sent2 = ['John', 'walks', 'a', 'dog', '.']
|
||||
>>>
|
||||
>>> # Parse a single sentence.
|
||||
>>> parsed_sent1 = mp.parse_one(sent1)
|
||||
>>> parsed_sent2 = mp.parse_one(sent2)
|
||||
>>> print(parsed_sent1.tree())
|
||||
(sees John Mary .)
|
||||
>>> print(parsed_sent2.tree())
|
||||
(walks John (dog a) .)
|
||||
>>>
|
||||
>>> # Parsing multiple sentences.
|
||||
>>> sentences = [sent1,sent2]
|
||||
>>> parsed_sents = mp.parse_sents(sentences)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(sees John Mary .)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(walks John (dog a) .)
|
||||
>>>
|
||||
>>> # Initialize a MaltParser object with an English pre-trained model.
|
||||
>>> parser_dirname = 'maltparser-1.9.2'
|
||||
>>> model_name = 'engmalt.linear-1.7.mco'
|
||||
>>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
|
||||
>>> sent1 = 'I shot an elephant in my pajamas .'.split()
|
||||
>>> sent2 = 'Time flies like banana .'.split()
|
||||
>>> # Parse a single sentence.
|
||||
>>> print(mp.parse_one(sent1).tree())
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
# Parsing multiple sentences
|
||||
>>> sentences = [sent1,sent2]
|
||||
>>> parsed_sents = mp.parse_sents(sentences)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(flies Time (like banana) .)
|
||||
"""
|
||||
|
||||
import doctest
|
||||
|
||||
doctest.testmod()
|
||||
@@ -0,0 +1,772 @@
|
||||
# Natural Language Toolkit: Dependency Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Jason Narad <jason.narad@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
import logging
|
||||
import math
|
||||
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#################################################################
|
||||
# DependencyScorerI - Interface for Graph-Edge Weight Calculation
|
||||
#################################################################
|
||||
|
||||
|
||||
class DependencyScorerI:
|
||||
"""
|
||||
A scorer for calculating the weights on the edges of a weighted
|
||||
dependency graph. This is used by a
|
||||
``ProbabilisticNonprojectiveParser`` to initialize the edge
|
||||
weights of a ``DependencyGraph``. While typically this would be done
|
||||
by training a binary classifier, any class that can return a
|
||||
multidimensional list representation of the edge weights can
|
||||
implement this interface. As such, it has no necessary
|
||||
fields.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
if self.__class__ == DependencyScorerI:
|
||||
raise TypeError("DependencyScorerI is an abstract interface")
|
||||
|
||||
def train(self, graphs):
|
||||
"""
|
||||
:type graphs: list(DependencyGraph)
|
||||
:param graphs: A list of dependency graphs to train the scorer.
|
||||
Typically the edges present in the graphs can be used as
|
||||
positive training examples, and the edges not present as negative
|
||||
examples.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def score(self, graph):
|
||||
"""
|
||||
:type graph: DependencyGraph
|
||||
:param graph: A dependency graph whose set of edges need to be
|
||||
scored.
|
||||
:rtype: A three-dimensional list of numbers.
|
||||
:return: The score is returned in a multidimensional(3) list, such
|
||||
that the outer-dimension refers to the head, and the
|
||||
inner-dimension refers to the dependencies. For instance,
|
||||
scores[0][1] would reference the list of scores corresponding to
|
||||
arcs from node 0 to node 1. The node's 'address' field can be used
|
||||
to determine its number identification.
|
||||
|
||||
For further illustration, a score list corresponding to Fig.2 of
|
||||
Keith Hall's 'K-best Spanning Tree Parsing' paper::
|
||||
|
||||
scores = [[[], [5], [1], [1]],
|
||||
[[], [], [11], [4]],
|
||||
[[], [10], [], [5]],
|
||||
[[], [8], [8], []]]
|
||||
|
||||
When used in conjunction with a MaxEntClassifier, each score would
|
||||
correspond to the confidence of a particular edge being classified
|
||||
with the positive training examples.
|
||||
"""
|
||||
raise NotImplementedError()
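# Illustrative note (not part of the original module): with the example matrix
# in the docstring above, scores[0][1] == [5] is the score list for an arc from
# node 0 (the root) to node 1, and scores[2][1] == [10] for an arc from node 2
# to node 1.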
|
||||
|
||||
|
||||
#################################################################
|
||||
# NaiveBayesDependencyScorer
|
||||
#################################################################
|
||||
|
||||
|
||||
class NaiveBayesDependencyScorer(DependencyScorerI):
|
||||
"""
|
||||
A dependency scorer built around a MaxEnt classifier. In this
|
||||
particular class that classifier is a ``NaiveBayesClassifier``.
|
||||
It uses head-word, head-tag, child-word, and child-tag features
|
||||
for classification.
|
||||
|
||||
>>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2
|
||||
|
||||
>>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
|
||||
>>> npp = ProbabilisticNonprojectiveParser()
|
||||
>>> npp.train(graphs, NaiveBayesDependencyScorer())
|
||||
>>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
|
||||
>>> len(list(parses))
|
||||
1
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass # Do nothing without throwing error
|
||||
|
||||
def train(self, graphs):
|
||||
"""
|
||||
Trains a ``NaiveBayesClassifier`` using the edges present in
|
||||
graphs list as positive examples, the edges not present as
|
||||
negative examples. Uses a feature vector of head-word,
|
||||
head-tag, child-word, and child-tag.
|
||||
|
||||
:type graphs: list(DependencyGraph)
|
||||
:param graphs: A list of dependency graphs to train the scorer.
|
||||
"""
|
||||
|
||||
from nltk.classify import NaiveBayesClassifier
|
||||
|
||||
# Create labeled training examples
|
||||
labeled_examples = []
|
||||
for graph in graphs:
|
||||
for head_node in graph.nodes.values():
|
||||
for child_index, child_node in graph.nodes.items():
|
||||
if child_index in head_node["deps"]:
|
||||
label = "T"
|
||||
else:
|
||||
label = "F"
|
||||
labeled_examples.append(
|
||||
(
|
||||
dict(
|
||||
a=head_node["word"],
|
||||
b=head_node["tag"],
|
||||
c=child_node["word"],
|
||||
d=child_node["tag"],
|
||||
),
|
||||
label,
|
||||
)
|
||||
)
|
||||
|
||||
self.classifier = NaiveBayesClassifier.train(labeled_examples)
|
||||
|
||||
def score(self, graph):
|
||||
"""
|
||||
Converts the graph into a feature-based representation of
|
||||
each edge, and then assigns a score to each based on the
|
||||
confidence of the classifier in assigning it to the
|
||||
positive label. Scores are returned in a multidimensional list.
|
||||
|
||||
:type graph: DependencyGraph
|
||||
:param graph: A dependency graph to score.
|
||||
:rtype: 3 dimensional list
|
||||
:return: Edge scores for the graph parameter.
|
||||
"""
|
||||
# Convert graph to feature representation
|
||||
edges = []
|
||||
for head_node in graph.nodes.values():
|
||||
for child_node in graph.nodes.values():
|
||||
edges.append(
|
||||
dict(
|
||||
a=head_node["word"],
|
||||
b=head_node["tag"],
|
||||
c=child_node["word"],
|
||||
d=child_node["tag"],
|
||||
)
|
||||
)
|
||||
|
||||
# Score edges
|
||||
edge_scores = []
|
||||
row = []
|
||||
count = 0
|
||||
for pdist in self.classifier.prob_classify_many(edges):
|
||||
logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
|
||||
# smoothing in case the probability = 0
|
||||
row.append([math.log(pdist.prob("T") + 0.00000000001)])
|
||||
count += 1
|
||||
if count == len(graph.nodes):
|
||||
edge_scores.append(row)
|
||||
row = []
|
||||
count = 0
|
||||
return edge_scores
|
||||
|
||||
|
||||
#################################################################
|
||||
# A Scorer for Demo Purposes
|
||||
#################################################################
|
||||
# A short class necessary to show parsing example from paper
|
||||
class DemoScorer(DependencyScorerI):
|
||||
def train(self, graphs):
|
||||
print("Training...")
|
||||
|
||||
def score(self, graph):
|
||||
# scores for Keith Hall 'K-best Spanning Tree Parsing' paper
|
||||
return [
|
||||
[[], [5], [1], [1]],
|
||||
[[], [], [11], [4]],
|
||||
[[], [10], [], [5]],
|
||||
[[], [8], [8], []],
|
||||
]
|
||||
|
||||
|
||||
#################################################################
|
||||
# Non-Projective Probabilistic Parsing
|
||||
#################################################################
|
||||
|
||||
|
||||
class ProbabilisticNonprojectiveParser:
|
||||
"""A probabilistic non-projective dependency parser.
|
||||
|
||||
Nonprojective dependencies allow for "crossing branches" in the parse tree,
which is necessary for representing particular linguistic phenomena, or even
typical parses in some languages. This parser follows the MST parsing
algorithm, outlined in McDonald (2005), which likens the search for the best
|
||||
non-projective parse to finding the maximum spanning tree in a weighted
|
||||
directed graph.
|
||||
|
||||
>>> class Scorer(DependencyScorerI):
|
||||
... def train(self, graphs):
|
||||
... pass
|
||||
...
|
||||
... def score(self, graph):
|
||||
... return [
|
||||
... [[], [5], [1], [1]],
|
||||
... [[], [], [11], [4]],
|
||||
... [[], [10], [], [5]],
|
||||
... [[], [8], [8], []],
|
||||
... ]
|
||||
|
||||
|
||||
>>> npp = ProbabilisticNonprojectiveParser()
|
||||
>>> npp.train([], Scorer())
|
||||
|
||||
>>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
|
||||
>>> len(list(parses))
|
||||
1
|
||||
|
||||
Rule based example
|
||||
|
||||
>>> from nltk.grammar import DependencyGrammar
|
||||
|
||||
>>> grammar = DependencyGrammar.fromstring('''
|
||||
... 'taught' -> 'play' | 'man'
|
||||
... 'man' -> 'the' | 'in'
|
||||
... 'in' -> 'corner'
|
||||
... 'corner' -> 'the'
|
||||
... 'play' -> 'golf' | 'dachshund' | 'to'
|
||||
... 'dachshund' -> 'his'
|
||||
... ''')
|
||||
|
||||
>>> ndp = NonprojectiveDependencyParser(grammar)
|
||||
>>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
|
||||
>>> len(list(parses))
|
||||
4
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Creates a new non-projective parser.
|
||||
"""
|
||||
logging.debug("initializing prob. nonprojective...")
|
||||
|
||||
def train(self, graphs, dependency_scorer):
|
||||
"""
|
||||
Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
|
||||
and establishes this as the parser's scorer. This is used to
|
||||
initialize the scores on a ``DependencyGraph`` during the parsing
|
||||
procedure.
|
||||
|
||||
:type graphs: list(DependencyGraph)
|
||||
:param graphs: A list of dependency graphs to train the scorer.
|
||||
:type dependency_scorer: DependencyScorerI
|
||||
:param dependency_scorer: A scorer which implements the
|
||||
``DependencyScorerI`` interface.
|
||||
"""
|
||||
self._scorer = dependency_scorer
|
||||
self._scorer.train(graphs)
|
||||
|
||||
def initialize_edge_scores(self, graph):
|
||||
"""
|
||||
Assigns a score to every edge in the ``DependencyGraph`` graph.
|
||||
These scores are generated via the parser's scorer which
|
||||
was assigned during the training process.
|
||||
|
||||
:type graph: DependencyGraph
|
||||
:param graph: A dependency graph to assign scores to.
|
||||
"""
|
||||
self.scores = self._scorer.score(graph)
|
||||
|
||||
def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
|
||||
"""
|
||||
Takes a list of nodes that have been identified to belong to a cycle,
|
||||
and collapses them into one larger node. The arcs of all nodes in
|
||||
the graph must be updated to account for this.
|
||||
|
||||
:type new_node: Node.
|
||||
:param new_node: A Node (Dictionary) to collapse the cycle nodes into.
|
||||
:type cycle_path: A list of integers.
|
||||
:param cycle_path: A list of node addresses, each of which is in the cycle.
|
||||
:type g_graph, b_graph, c_graph: DependencyGraph
|
||||
:param g_graph, b_graph, c_graph: Graphs which need to be updated.
|
||||
"""
|
||||
logger.debug("Collapsing nodes...")
|
||||
# Collapse all cycle nodes into v_n+1 in G_Graph
|
||||
for cycle_node_index in cycle_path:
|
||||
g_graph.remove_by_address(cycle_node_index)
|
||||
g_graph.add_node(new_node)
|
||||
g_graph.redirect_arcs(cycle_path, new_node["address"])
|
||||
|
||||
def update_edge_scores(self, new_node, cycle_path):
|
||||
"""
|
||||
Updates the edge scores to reflect a collapse operation into
|
||||
new_node.
|
||||
|
||||
:type new_node: A Node.
|
||||
:param new_node: The node which cycle nodes are collapsed into.
|
||||
:type cycle_path: A list of integers.
|
||||
:param cycle_path: A list of node addresses that belong to the cycle.
|
||||
"""
|
||||
logger.debug("cycle %s", cycle_path)
|
||||
|
||||
cycle_path = self.compute_original_indexes(cycle_path)
|
||||
|
||||
logger.debug("old cycle %s", cycle_path)
|
||||
logger.debug("Prior to update: %s", self.scores)
|
||||
|
||||
for i, row in enumerate(self.scores):
|
||||
for j, column in enumerate(self.scores[i]):
|
||||
logger.debug(self.scores[i][j])
|
||||
if j in cycle_path and i not in cycle_path and self.scores[i][j]:
|
||||
subtract_val = self.compute_max_subtract_score(j, cycle_path)
|
||||
|
||||
logger.debug("%s - %s", self.scores[i][j], subtract_val)
|
||||
|
||||
new_vals = []
|
||||
for cur_val in self.scores[i][j]:
|
||||
new_vals.append(cur_val - subtract_val)
|
||||
|
||||
self.scores[i][j] = new_vals
|
||||
|
||||
for i, row in enumerate(self.scores):
|
||||
for j, cell in enumerate(self.scores[i]):
|
||||
if i in cycle_path and j in cycle_path:
|
||||
self.scores[i][j] = []
|
||||
|
||||
logger.debug("After update: %s", self.scores)
|
||||
|
||||
def compute_original_indexes(self, new_indexes):
|
||||
"""
|
||||
As nodes are collapsed into others, they are replaced
|
||||
by the new node in the graph, but it's still necessary
|
||||
to keep track of what these original nodes were. This
|
||||
takes a list of node addresses and replaces any collapsed
|
||||
node addresses with their original addresses.
|
||||
|
||||
:type new_indexes: A list of integers.
|
||||
:param new_indexes: A list of node addresses to check for
|
||||
subsumed nodes.
|
||||
"""
|
||||
swapped = True
|
||||
while swapped:
|
||||
originals = []
|
||||
swapped = False
|
||||
for new_index in new_indexes:
|
||||
if new_index in self.inner_nodes:
|
||||
for old_val in self.inner_nodes[new_index]:
|
||||
if old_val not in originals:
|
||||
originals.append(old_val)
|
||||
swapped = True
|
||||
else:
|
||||
originals.append(new_index)
|
||||
new_indexes = originals
|
||||
return new_indexes
|
||||
|
||||
def compute_max_subtract_score(self, column_index, cycle_indexes):
|
||||
"""
|
||||
When updating scores the score of the highest-weighted incoming
|
||||
arc is subtracted upon collapse. This returns the correct
|
||||
amount to subtract from that edge.
|
||||
|
||||
:type column_index: integer.
|
||||
:param column_index: An index representing the column of incoming arcs
|
||||
to a particular node being updated
|
||||
:type cycle_indexes: A list of integers.
|
||||
:param cycle_indexes: Only arcs from cycle nodes are considered. This
|
||||
is a list of such node addresses.
|
||||
"""
|
||||
max_score = -100000
|
||||
for row_index in cycle_indexes:
|
||||
for subtract_val in self.scores[row_index][column_index]:
|
||||
if subtract_val > max_score:
|
||||
max_score = subtract_val
|
||||
return max_score
|
||||
|
||||
def best_incoming_arc(self, node_index):
|
||||
"""
|
||||
Returns the source of the best incoming arc to the
|
||||
node with address: node_index
|
||||
|
||||
:type node_index: integer.
|
||||
:param node_index: The address of the 'destination' node,
|
||||
the node that is arced to.
|
||||
"""
|
||||
originals = self.compute_original_indexes([node_index])
|
||||
logger.debug("originals: %s", originals)
|
||||
|
||||
max_arc = None
|
||||
max_score = None
|
||||
for row_index in range(len(self.scores)):
|
||||
for col_index in range(len(self.scores[row_index])):
|
||||
if col_index in originals and (
|
||||
max_score is None or self.scores[row_index][col_index] > max_score
|
||||
):
|
||||
max_score = self.scores[row_index][col_index]
|
||||
max_arc = row_index
|
||||
logger.debug("%s, %s", row_index, col_index)
|
||||
|
||||
logger.debug(max_score)
|
||||
|
||||
for key in self.inner_nodes:
|
||||
replaced_nodes = self.inner_nodes[key]
|
||||
if max_arc in replaced_nodes:
|
||||
return key
|
||||
|
||||
return max_arc
|
||||
|
||||
def original_best_arc(self, node_index):
|
||||
originals = self.compute_original_indexes([node_index])
|
||||
max_arc = None
|
||||
max_score = None
|
||||
max_orig = None
|
||||
for row_index in range(len(self.scores)):
|
||||
for col_index in range(len(self.scores[row_index])):
|
||||
if col_index in originals and (
|
||||
max_score is None or self.scores[row_index][col_index] > max_score
|
||||
):
|
||||
max_score = self.scores[row_index][col_index]
|
||||
max_arc = row_index
|
||||
max_orig = col_index
|
||||
return [max_arc, max_orig]
|
||||
|
||||
def parse(self, tokens, tags):
|
||||
"""
|
||||
Parses a list of tokens in accordance to the MST parsing algorithm
|
||||
for non-projective dependency parses. Assumes that the tokens to
|
||||
be parsed have already been tagged and those tags are provided. Various
|
||||
scoring methods can be used by implementing the ``DependencyScorerI``
|
||||
interface and passing it to the training algorithm.
|
||||
|
||||
:type tokens: list(str)
|
||||
:param tokens: A list of words or punctuation to be parsed.
|
||||
:type tags: list(str)
|
||||
:param tags: A list of tags corresponding by index to the words in the tokens list.
|
||||
:return: An iterator of non-projective parses.
|
||||
:rtype: iter(DependencyGraph)
|
||||
"""
|
||||
self.inner_nodes = {}
|
||||
|
||||
# Initialize g_graph
|
||||
g_graph = DependencyGraph()
|
||||
for index, token in enumerate(tokens):
|
||||
g_graph.nodes[index + 1].update(
|
||||
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
||||
)
|
||||
|
||||
# Fully connect non-root nodes in g_graph
|
||||
g_graph.connect_graph()
|
||||
original_graph = DependencyGraph()
|
||||
for index, token in enumerate(tokens):
|
||||
original_graph.nodes[index + 1].update(
|
||||
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
||||
)
|
||||
|
||||
b_graph = DependencyGraph()
|
||||
c_graph = DependencyGraph()
|
||||
|
||||
for index, token in enumerate(tokens):
|
||||
c_graph.nodes[index + 1].update(
|
||||
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
||||
)
|
||||
|
||||
# Assign initial scores to g_graph edges
|
||||
self.initialize_edge_scores(g_graph)
|
||||
logger.debug(self.scores)
|
||||
# Initialize a list of unvisited vertices (by node address)
|
||||
unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
|
||||
# Iterate over unvisited vertices
|
||||
nr_vertices = len(tokens)
|
||||
betas = {}
|
||||
while unvisited_vertices:
|
||||
# Mark current node as visited
|
||||
current_vertex = unvisited_vertices.pop(0)
|
||||
logger.debug("current_vertex: %s", current_vertex)
|
||||
# Get corresponding node n_i to vertex v_i
|
||||
current_node = g_graph.get_by_address(current_vertex)
|
||||
logger.debug("current_node: %s", current_node)
|
||||
# Get best in-edge node b for current node
|
||||
best_in_edge = self.best_incoming_arc(current_vertex)
|
||||
betas[current_vertex] = self.original_best_arc(current_vertex)
|
||||
logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
|
||||
# b_graph = Union(b_graph, b)
|
||||
for new_vertex in [current_vertex, best_in_edge]:
|
||||
b_graph.nodes[new_vertex].update(
|
||||
{"word": "TEMP", "rel": "NTOP", "address": new_vertex}
|
||||
)
|
||||
b_graph.add_arc(best_in_edge, current_vertex)
|
||||
# Beta(current node) = b - stored for parse recovery
|
||||
# If b_graph contains a cycle, collapse it
|
||||
cycle_path = b_graph.contains_cycle()
|
||||
if cycle_path:
|
||||
# Create a new node v_n+1 with address = len(nodes) + 1
|
||||
new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
|
||||
# c_graph = Union(c_graph, v_n+1)
|
||||
c_graph.add_node(new_node)
|
||||
# Collapse all nodes in cycle C into v_n+1
|
||||
self.update_edge_scores(new_node, cycle_path)
|
||||
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
|
||||
for cycle_index in cycle_path:
|
||||
c_graph.add_arc(new_node["address"], cycle_index)
|
||||
# self.replaced_by[cycle_index] = new_node['address']
|
||||
|
||||
self.inner_nodes[new_node["address"]] = cycle_path
|
||||
|
||||
# Add v_n+1 to list of unvisited vertices
|
||||
unvisited_vertices.insert(0, nr_vertices + 1)
|
||||
|
||||
# increment # of nodes counter
|
||||
nr_vertices += 1
|
||||
|
||||
# Remove cycle nodes from b_graph; B = B - cycle c
|
||||
for cycle_node_address in cycle_path:
|
||||
b_graph.remove_by_address(cycle_node_address)
|
||||
|
||||
logger.debug("g_graph: %s", g_graph)
|
||||
logger.debug("b_graph: %s", b_graph)
|
||||
logger.debug("c_graph: %s", c_graph)
|
||||
logger.debug("Betas: %s", betas)
|
||||
logger.debug("replaced nodes %s", self.inner_nodes)
|
||||
|
||||
# Recover parse tree
|
||||
logger.debug("Final scores: %s", self.scores)
|
||||
|
||||
logger.debug("Recovering parse...")
|
||||
for i in range(len(tokens) + 1, nr_vertices + 1):
|
||||
betas[betas[i][1]] = betas[i]
|
||||
|
||||
logger.debug("Betas: %s", betas)
|
||||
for node in original_graph.nodes.values():
|
||||
# TODO: It's dangerous to assume that deps is a dictionary
# because it's a default dictionary. Ideally, here we should not
# be concerned with how dependencies are stored inside of a
# dependency graph.
|
||||
node["deps"] = {}
|
||||
for i in range(1, len(tokens) + 1):
|
||||
original_graph.add_arc(betas[i][0], betas[i][1])
|
||||
|
||||
logger.debug("Done.")
|
||||
yield original_graph
|
||||
|
||||
|
||||
#################################################################
|
||||
# Rule-based Non-Projective Parser
|
||||
#################################################################
|
||||
|
||||
|
||||
class NonprojectiveDependencyParser:
|
||||
"""
|
||||
A non-projective, rule-based, dependency parser. This parser
|
||||
will return the set of all possible non-projective parses based on
|
||||
the word-to-word relations defined in the parser's dependency
|
||||
grammar, and will allow the branches of the parse tree to cross
|
||||
in order to capture a variety of linguistic phenomena that a
|
||||
projective parser will not.
|
||||
"""
|
||||
|
||||
def __init__(self, dependency_grammar):
|
||||
"""
|
||||
Creates a new ``NonprojectiveDependencyParser``.
|
||||
|
||||
:param dependency_grammar: a grammar of word-to-word relations.
|
||||
:type dependency_grammar: DependencyGrammar
|
||||
"""
|
||||
self._grammar = dependency_grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Parses the input tokens with respect to the parser's grammar. Parsing
|
||||
is accomplished by representing the search-space of possible parses as
|
||||
a fully-connected directed graph. Arcs that would lead to ungrammatical
|
||||
parses are removed and a lattice is constructed of length n, where n is
|
||||
the number of input tokens, to represent all possible grammatical
|
||||
traversals. All possible paths through the lattice are then enumerated
|
||||
to produce the set of non-projective parses.
|
||||
|
||||
:param tokens: A list of tokens to parse.
:type tokens: list(str)
:return: An iterator of non-projective parses.
:rtype: iter(DependencyGraph)
|
||||
"""
|
||||
# Create graph representation of tokens
|
||||
self._graph = DependencyGraph()
|
||||
|
||||
for index, token in enumerate(tokens):
|
||||
self._graph.nodes[index] = {
|
||||
"word": token,
|
||||
"deps": [],
|
||||
"rel": "NTOP",
|
||||
"address": index,
|
||||
}
|
||||
|
||||
for head_node in self._graph.nodes.values():
|
||||
deps = []
|
||||
for dep_node in self._graph.nodes.values():
|
||||
if (
|
||||
self._grammar.contains(head_node["word"], dep_node["word"])
|
||||
and head_node["word"] != dep_node["word"]
|
||||
):
|
||||
deps.append(dep_node["address"])
|
||||
head_node["deps"] = deps
|
||||
|
||||
# Create lattice of possible heads
|
||||
roots = []
|
||||
possible_heads = []
|
||||
for i, word in enumerate(tokens):
|
||||
heads = []
|
||||
for j, head in enumerate(tokens):
|
||||
if (i != j) and self._grammar.contains(head, word):
|
||||
heads.append(j)
|
||||
if len(heads) == 0:
|
||||
roots.append(i)
|
||||
possible_heads.append(heads)
|
||||
|
||||
# Set roots to attempt
|
||||
if len(roots) < 2:
|
||||
if len(roots) == 0:
|
||||
for i in range(len(tokens)):
|
||||
roots.append(i)
|
||||
|
||||
# Traverse lattice
|
||||
analyses = []
|
||||
for _ in roots:
|
||||
stack = []
|
||||
analysis = [[] for i in range(len(possible_heads))]
|
||||
i = 0
|
||||
forward = True
|
||||
while i >= 0:
|
||||
if forward:
|
||||
if len(possible_heads[i]) == 1:
|
||||
analysis[i] = possible_heads[i][0]
|
||||
elif len(possible_heads[i]) == 0:
|
||||
analysis[i] = -1
|
||||
else:
|
||||
head = possible_heads[i].pop()
|
||||
analysis[i] = head
|
||||
stack.append([i, head])
|
||||
if not forward:
|
||||
index_on_stack = False
|
||||
for stack_item in stack:
|
||||
if stack_item[0] == i:
|
||||
index_on_stack = True
|
||||
orig_length = len(possible_heads[i])
|
||||
|
||||
if index_on_stack and orig_length == 0:
|
||||
for j in range(len(stack) - 1, -1, -1):
|
||||
stack_item = stack[j]
|
||||
if stack_item[0] == i:
|
||||
possible_heads[i].append(stack.pop(j)[1])
|
||||
|
||||
elif index_on_stack and orig_length > 0:
|
||||
head = possible_heads[i].pop()
|
||||
analysis[i] = head
|
||||
stack.append([i, head])
|
||||
forward = True
|
||||
|
||||
if i + 1 == len(possible_heads):
|
||||
analyses.append(analysis[:])
|
||||
forward = False
|
||||
if forward:
|
||||
i += 1
|
||||
else:
|
||||
i -= 1
|
||||
|
||||
# Filter parses
|
||||
# ensure 1 root, everything has 1 head
|
||||
for analysis in analyses:
|
||||
if analysis.count(-1) > 1:
|
||||
# there are several root elements!
|
||||
continue
|
||||
|
||||
graph = DependencyGraph()
|
||||
graph.root = graph.nodes[analysis.index(-1) + 1]
|
||||
|
||||
for address, (token, head_index) in enumerate(
|
||||
zip(tokens, analysis), start=1
|
||||
):
|
||||
head_address = head_index + 1
|
||||
|
||||
node = graph.nodes[address]
|
||||
node.update({"word": token, "address": address})
|
||||
|
||||
if head_address == 0:
|
||||
rel = "ROOT"
|
||||
else:
|
||||
rel = ""
|
||||
graph.nodes[head_index + 1]["deps"][rel].append(address)
|
||||
|
||||
# TODO: check for cycles
|
||||
yield graph
|
||||
|
||||
|
||||
#################################################################
|
||||
# Demos
|
||||
#################################################################
|
||||
|
||||
|
||||
def demo():
|
||||
# hall_demo()
|
||||
nonprojective_conll_parse_demo()
|
||||
rule_based_demo()
|
||||
|
||||
|
||||
def hall_demo():
|
||||
npp = ProbabilisticNonprojectiveParser()
|
||||
npp.train([], DemoScorer())
|
||||
for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
|
||||
print(parse_graph)
|
||||
|
||||
|
||||
def nonprojective_conll_parse_demo():
|
||||
from nltk.parse.dependencygraph import conll_data2
|
||||
|
||||
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
||||
npp = ProbabilisticNonprojectiveParser()
|
||||
npp.train(graphs, NaiveBayesDependencyScorer())
|
||||
for parse_graph in npp.parse(
|
||||
["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
|
||||
):
|
||||
print(parse_graph)
|
||||
|
||||
|
||||
def rule_based_demo():
|
||||
from nltk.grammar import DependencyGrammar
|
||||
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'taught' -> 'play' | 'man'
|
||||
'man' -> 'the' | 'in'
|
||||
'in' -> 'corner'
|
||||
'corner' -> 'the'
|
||||
'play' -> 'golf' | 'dachshund' | 'to'
|
||||
'dachshund' -> 'his'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
ndp = NonprojectiveDependencyParser(grammar)
|
||||
graphs = ndp.parse(
|
||||
[
|
||||
"the",
|
||||
"man",
|
||||
"in",
|
||||
"the",
|
||||
"corner",
|
||||
"taught",
|
||||
"his",
|
||||
"dachshund",
|
||||
"to",
|
||||
"play",
|
||||
"golf",
|
||||
]
|
||||
)
|
||||
print("Graphs:")
|
||||
for graph in graphs:
|
||||
print(graph)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
579
backend/venv/Lib/site-packages/nltk/parse/pchart.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# Natural Language Toolkit: Probabilistic Chart Parsers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classes and interfaces for associating probabilities with tree
|
||||
structures that represent the internal organization of a text. The
|
||||
probabilistic parser module defines ``BottomUpProbabilisticChartParser``.
|
||||
|
||||
``BottomUpProbabilisticChartParser`` is an abstract class that implements
|
||||
a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges,
|
||||
and adds them to the chart one at a time. The ordering of this queue
|
||||
is based on the probabilities associated with the edges, allowing the
|
||||
parser to expand more likely edges before less likely ones. Each
|
||||
subclass implements a different queue ordering, producing different
|
||||
search strategies. Currently the following subclasses are defined:
|
||||
|
||||
- ``InsideChartParser`` searches edges in decreasing order of
|
||||
their trees' inside probabilities.
|
||||
- ``RandomChartParser`` searches edges in random order.
|
||||
- ``LongestChartParser`` searches edges in decreasing order of their
|
||||
location's length.
|
||||
|
||||
The ``BottomUpProbabilisticChartParser`` constructor has an optional
|
||||
argument beam_size. If non-zero, this controls the size of the beam
|
||||
(aka the edge queue). This option is most useful with InsideChartParser.
|
||||
"""
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Bottom-Up PCFG Chart Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
# [XX] This might not be implemented quite right -- it would be better
|
||||
# to associate probabilities with child pointer lists.
|
||||
|
||||
import random
|
||||
from functools import reduce
|
||||
|
||||
from nltk.grammar import PCFG, Nonterminal
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.chart import AbstractChartRule, Chart, LeafEdge, TreeEdge
|
||||
from nltk.tree import ProbabilisticTree, Tree
|
||||
|
||||
|
||||
# Probabilistic edges
|
||||
class ProbabilisticLeafEdge(LeafEdge):
|
||||
def prob(self):
|
||||
return 1.0
|
||||
|
||||
|
||||
class ProbabilisticTreeEdge(TreeEdge):
|
||||
def __init__(self, prob, *args, **kwargs):
|
||||
TreeEdge.__init__(self, *args, **kwargs)
|
||||
self._prob = prob
|
||||
# two edges with different probabilities are not equal.
|
||||
self._comparison_key = (self._comparison_key, prob)
|
||||
|
||||
def prob(self):
|
||||
return self._prob
|
||||
|
||||
@staticmethod
|
||||
def from_production(production, index, p):
|
||||
return ProbabilisticTreeEdge(
|
||||
p, (index, index), production.lhs(), production.rhs(), 0
|
||||
)
|
||||
|
||||
|
||||
# Rules using probabilistic edges
|
||||
class ProbabilisticBottomUpInitRule(AbstractChartRule):
|
||||
NUM_EDGES = 0
|
||||
|
||||
def apply(self, chart, grammar):
|
||||
for index in range(chart.num_leaves()):
|
||||
new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class ProbabilisticBottomUpPredictRule(AbstractChartRule):
|
||||
NUM_EDGES = 1
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_incomplete():
|
||||
return
|
||||
for prod in grammar.productions():
|
||||
if edge.lhs() == prod.rhs()[0]:
|
||||
new_edge = ProbabilisticTreeEdge.from_production(
|
||||
prod, edge.start(), prod.prob()
|
||||
)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class ProbabilisticFundamentalRule(AbstractChartRule):
|
||||
NUM_EDGES = 2
|
||||
|
||||
def apply(self, chart, grammar, left_edge, right_edge):
|
||||
# Make sure the rule is applicable.
|
||||
if not (
|
||||
left_edge.end() == right_edge.start()
|
||||
and left_edge.nextsym() == right_edge.lhs()
|
||||
and left_edge.is_incomplete()
|
||||
and right_edge.is_complete()
|
||||
):
|
||||
return
|
||||
|
||||
# Construct the new edge.
|
||||
p = left_edge.prob() * right_edge.prob()
|
||||
new_edge = ProbabilisticTreeEdge(
|
||||
p,
|
||||
span=(left_edge.start(), right_edge.end()),
|
||||
lhs=left_edge.lhs(),
|
||||
rhs=left_edge.rhs(),
|
||||
dot=left_edge.dot() + 1,
|
||||
)
|
||||
|
||||
# Add it to the chart, with appropriate child pointers.
|
||||
changed_chart = False
|
||||
for cpl1 in chart.child_pointer_lists(left_edge):
|
||||
if chart.insert(new_edge, cpl1 + (right_edge,)):
|
||||
changed_chart = True
|
||||
|
||||
# If we changed the chart, then generate the edge.
|
||||
if changed_chart:
|
||||
yield new_edge
|
||||
|
||||
|
||||
class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
|
||||
NUM_EDGES = 1
|
||||
|
||||
_fundamental_rule = ProbabilisticFundamentalRule()
|
||||
|
||||
def apply(self, chart, grammar, edge1):
|
||||
fr = self._fundamental_rule
|
||||
if edge1.is_incomplete():
|
||||
# edge1 = left_edge; edge2 = right_edge
|
||||
for edge2 in chart.select(
|
||||
start=edge1.end(), is_complete=True, lhs=edge1.nextsym()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, edge1, edge2)
|
||||
else:
|
||||
# edge2 = left_edge; edge1 = right_edge
|
||||
for edge2 in chart.select(
|
||||
end=edge1.start(), is_complete=False, nextsym=edge1.lhs()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, edge2, edge1)
|
||||
|
||||
def __str__(self):
|
||||
return "Fundamental Rule"
|
||||
|
||||
|
||||
class BottomUpProbabilisticChartParser(ParserI):
|
||||
"""
|
||||
An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
|
||||
record partial results. ``BottomUpProbabilisticChartParser`` maintains
|
||||
a queue of edges that can be added to the chart. This queue is
|
||||
initialized with edges for each token in the text that is being
|
||||
parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into
|
||||
the chart one at a time, starting with the most likely edges, and
|
||||
proceeding to less likely edges. For each edge that is added to
|
||||
the chart, it may become possible to insert additional edges into
|
||||
the chart; these are added to the queue. This process continues
|
||||
until enough complete parses have been generated, or until the
|
||||
queue is empty.
|
||||
|
||||
The sorting order for the queue is not specified by
|
||||
``BottomUpProbabilisticChartParser``. Different sorting orders will
|
||||
result in different search strategies. The sorting order for the
|
||||
queue is defined by the method ``sort_queue``; subclasses are required
|
||||
to provide a definition for this method.
|
||||
|
||||
:type _grammar: PCFG
|
||||
:ivar _grammar: The grammar used to parse sentences.
|
||||
:type _trace: int
|
||||
:ivar _trace: The level of tracing output that should be generated
|
||||
when parsing a text.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, beam_size=0, trace=0):
|
||||
"""
|
||||
Create a new ``BottomUpProbabilisticChartParser``, that uses
|
||||
``grammar`` to parse texts.
|
||||
|
||||
:type grammar: PCFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type beam_size: int
|
||||
:param beam_size: The maximum length for the parser's edge queue.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
if not isinstance(grammar, PCFG):
|
||||
raise ValueError("The grammar must be probabilistic PCFG")
|
||||
self._grammar = grammar
|
||||
self.beam_size = beam_size
|
||||
self._trace = trace
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
self._trace = trace
|
||||
|
||||
# TODO: change this to conform more with the standard ChartParser
|
||||
def parse(self, tokens):
|
||||
self._grammar.check_coverage(tokens)
|
||||
chart = Chart(list(tokens))
|
||||
grammar = self._grammar
|
||||
|
||||
# Chart parser rules.
|
||||
bu_init = ProbabilisticBottomUpInitRule()
|
||||
bu = ProbabilisticBottomUpPredictRule()
|
||||
fr = SingleEdgeProbabilisticFundamentalRule()
|
||||
|
||||
# Our queue
|
||||
queue = []
|
||||
|
||||
# Initialize the chart.
|
||||
for edge in bu_init.apply(chart, grammar):
|
||||
if self._trace > 1:
|
||||
print(
|
||||
" %-50s [%s]"
|
||||
% (chart.pretty_format_edge(edge, width=2), edge.prob())
|
||||
)
|
||||
queue.append(edge)
|
||||
|
||||
while len(queue) > 0:
|
||||
# Re-sort the queue.
|
||||
self.sort_queue(queue, chart)
|
||||
|
||||
# Prune the queue to the correct size if a beam was defined
|
||||
if self.beam_size:
|
||||
self._prune(queue, chart)
|
||||
|
||||
# Get the best edge.
|
||||
edge = queue.pop()
|
||||
if self._trace > 0:
|
||||
print(
|
||||
" %-50s [%s]"
|
||||
% (chart.pretty_format_edge(edge, width=2), edge.prob())
|
||||
)
|
||||
|
||||
# Apply BU & FR to it.
|
||||
queue.extend(bu.apply(chart, grammar, edge))
|
||||
queue.extend(fr.apply(chart, grammar, edge))
|
||||
|
||||
# Get a list of complete parses.
|
||||
parses = list(chart.parses(grammar.start(), ProbabilisticTree))
|
||||
|
||||
# Assign probabilities to the trees.
|
||||
prod_probs = {}
|
||||
for prod in grammar.productions():
|
||||
prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
|
||||
for parse in parses:
|
||||
self._setprob(parse, prod_probs)
|
||||
|
||||
# Sort by probability
|
||||
parses.sort(reverse=True, key=lambda tree: tree.prob())
|
||||
|
||||
return iter(parses)
|
||||
|
||||
def _setprob(self, tree, prod_probs):
|
||||
if tree.prob() is not None:
|
||||
return
|
||||
|
||||
# Get the prob of the CFG production.
|
||||
lhs = Nonterminal(tree.label())
|
||||
rhs = []
|
||||
for child in tree:
|
||||
if isinstance(child, Tree):
|
||||
rhs.append(Nonterminal(child.label()))
|
||||
else:
|
||||
rhs.append(child)
|
||||
prob = prod_probs[lhs, tuple(rhs)]
|
||||
|
||||
# Get the probs of children.
|
||||
for child in tree:
|
||||
if isinstance(child, Tree):
|
||||
self._setprob(child, prod_probs)
|
||||
prob *= child.prob()
|
||||
|
||||
tree.set_prob(prob)
|
||||
|
||||
def sort_queue(self, queue, chart):
|
||||
"""
|
||||
Sort the given queue of ``Edge`` objects, placing the edge that should
be tried first at the end of the queue (edges are popped off the end of
the queue). This method will be called after each ``Edge`` is added to
the queue.
|
||||
|
||||
:param queue: The queue of ``Edge`` objects to sort. Each edge in
|
||||
this queue is an edge that could be added to the chart by
|
||||
the fundamental rule; but that has not yet been added.
|
||||
:type queue: list(Edge)
|
||||
:param chart: The chart being used to parse the text. This
|
||||
chart can be used to provide extra information for sorting
|
||||
the queue.
|
||||
:type chart: Chart
|
||||
:rtype: None
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def _prune(self, queue, chart):
|
||||
"""Discard items in the queue if the queue is longer than the beam."""
|
||||
if len(queue) > self.beam_size:
|
||||
split = len(queue) - self.beam_size
|
||||
if self._trace > 2:
|
||||
for edge in queue[:split]:
|
||||
print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
|
||||
del queue[:split]
|
||||
|
||||
|
||||
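
# Illustrative sketch (not part of the NLTK source): a new search strategy can
# be defined by subclassing BottomUpProbabilisticChartParser and overriding
# sort_queue().  Edges are popped off the *end* of the queue, so the edge that
# should be tried next must sort last.  The class name, toy grammar and helper
# function below are invented for this example.
class ShortestFirstChartParser(BottomUpProbabilisticChartParser):
    """Tries shorter edges before longer ones (the reverse of LongestChartParser)."""

    def sort_queue(self, queue, chart):
        # Longest edges sort first, shortest last; queue.pop() then takes the shortest.
        queue.sort(key=lambda edge: -edge.length())


def _shortest_first_example():
    toy_grammar = PCFG.fromstring(
        """
        S -> 'a' S 'b' [0.4] | 'a' 'b' [0.6]
        """
    )
    parser = ShortestFirstChartParser(toy_grammar)
    for tree in parser.parse("a a b b".split()):
        print(tree.prob(), tree)
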
class InsideChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries edges in descending
|
||||
order of the inside probabilities of their trees. The "inside
|
||||
probability" of a tree is simply the
|
||||
probability of the entire tree, ignoring its context. In
|
||||
particular, the inside probability of a tree generated by
|
||||
production *p* with children *c[1], c[2], ..., c[n]* is
|
||||
*P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
|
||||
probability of a token is 1 if it is present in the text, and 0 if
|
||||
it is absent.
|
||||
|
||||
This sorting order results in a type of lowest-cost-first search
|
||||
strategy.
|
||||
"""
|
||||
|
||||
# Inherit constructor.
|
||||
def sort_queue(self, queue, chart):
|
||||
"""
|
||||
Sort the given queue of edges, in ascending order of the inside
probabilities of the edges' trees, so that the most probable edge is
at the end of the queue and is popped first.
|
||||
|
||||
:param queue: The queue of ``Edge`` objects to sort. Each edge in
|
||||
this queue is an edge that could be added to the chart by
|
||||
the fundamental rule; but that has not yet been added.
|
||||
:type queue: list(Edge)
|
||||
:param chart: The chart being used to parse the text. This
|
||||
chart can be used to provide extra information for sorting
|
||||
the queue.
|
||||
:type chart: Chart
|
||||
:rtype: None
|
||||
"""
|
||||
queue.sort(key=lambda edge: edge.prob())
|
||||
|
||||
|
||||
# Eventually, this will become some sort of inside-outside parser:
|
||||
# class InsideOutsideParser(BottomUpProbabilisticChartParser):
|
||||
# def __init__(self, grammar, trace=0):
|
||||
# # Inherit docs.
|
||||
# BottomUpProbabilisticChartParser.__init__(self, grammar, trace)
|
||||
#
|
||||
# # Find the best path from S to each nonterminal
|
||||
# bestp = {}
|
||||
# for production in grammar.productions(): bestp[production.lhs()]=0
|
||||
# bestp[grammar.start()] = 1.0
|
||||
#
|
||||
# for i in range(len(grammar.productions())):
|
||||
# for production in grammar.productions():
|
||||
# lhs = production.lhs()
|
||||
# for elt in production.rhs():
|
||||
# bestp[elt] = max(bestp[lhs]*production.prob(),
|
||||
# bestp.get(elt,0))
|
||||
#
|
||||
# self._bestp = bestp
|
||||
# for (k,v) in self._bestp.items(): print(k,v)
|
||||
#
|
||||
# def _sortkey(self, edge):
|
||||
# return edge.structure()[PROB] * self._bestp[edge.lhs()]
|
||||
#
|
||||
# def sort_queue(self, queue, chart):
|
||||
# queue.sort(key=self._sortkey)
|
||||
|
||||
|
||||
class RandomChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
|
||||
This sorting order results in a random search strategy.
|
||||
"""
|
||||
|
||||
# Inherit constructor
|
||||
def sort_queue(self, queue, chart):
|
||||
i = random.randint(0, len(queue) - 1)
|
||||
(queue[-1], queue[i]) = (queue[i], queue[-1])
|
||||
|
||||
|
||||
class UnsortedChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order they happen to appear in the queue (no sorting is performed).
|
||||
"""
|
||||
|
||||
# Inherit constructor
|
||||
def sort_queue(self, queue, chart):
|
||||
return
|
||||
|
||||
|
||||
class LongestChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries longer edges before
|
||||
shorter ones. This sorting order results in a type of best-first
|
||||
search strategy.
|
||||
"""
|
||||
|
||||
# Inherit constructor
|
||||
def sort_queue(self, queue, chart):
|
||||
queue.sort(key=lambda edge: edge.length())
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Test Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo(choice=None, draw_parses=None, print_parses=None):
|
||||
"""
|
||||
A demonstration of the probabilistic parsers. The user is
|
||||
prompted to select which demo to run, and how many parses should
|
||||
be found; and then each parser is run on the same demo, and a
|
||||
summary of the results are displayed.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk import tokenize
|
||||
from nltk.parse import pchart
|
||||
|
||||
# Define two demos. Each demo has a sentence and a grammar.
|
||||
toy_pcfg1 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
|
||||
Det -> 'the' [0.8] | 'my' [0.2]
|
||||
N -> 'man' [0.5] | 'telescope' [0.5]
|
||||
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
|
||||
V -> 'ate' [0.35] | 'saw' [0.65]
|
||||
PP -> P NP [1.0]
|
||||
P -> 'with' [0.61] | 'under' [0.39]
|
||||
"""
|
||||
)
|
||||
|
||||
toy_pcfg2 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
VP -> V NP [.59]
|
||||
VP -> V [.40]
|
||||
VP -> VP PP [.01]
|
||||
NP -> Det N [.41]
|
||||
NP -> Name [.28]
|
||||
NP -> NP PP [.31]
|
||||
PP -> P NP [1.0]
|
||||
V -> 'saw' [.21]
|
||||
V -> 'ate' [.51]
|
||||
V -> 'ran' [.28]
|
||||
N -> 'boy' [.11]
|
||||
N -> 'cookie' [.12]
|
||||
N -> 'table' [.13]
|
||||
N -> 'telescope' [.14]
|
||||
N -> 'hill' [.5]
|
||||
Name -> 'Jack' [.52]
|
||||
Name -> 'Bob' [.48]
|
||||
P -> 'with' [.61]
|
||||
P -> 'under' [.39]
|
||||
Det -> 'the' [.41]
|
||||
Det -> 'a' [.31]
|
||||
Det -> 'my' [.28]
|
||||
"""
|
||||
)
|
||||
|
||||
demos = [
|
||||
("I saw John with my telescope", toy_pcfg1),
|
||||
("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
|
||||
]
|
||||
|
||||
if choice is None:
|
||||
# Ask the user which demo they want to use.
|
||||
print()
|
||||
for i in range(len(demos)):
|
||||
print(f"{i + 1:>3}: {demos[i][0]}")
|
||||
print(" %r" % demos[i][1])
|
||||
print()
|
||||
print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
|
||||
choice = int(sys.stdin.readline().strip()) - 1
|
||||
try:
|
||||
sent, grammar = demos[choice]
|
||||
except (IndexError, TypeError):
|
||||
print("Bad sentence number")
|
||||
return
|
||||
|
||||
# Tokenize the sentence.
|
||||
tokens = sent.split()
|
||||
|
||||
# Define a list of parsers. We'll use all parsers.
|
||||
parsers = [
|
||||
pchart.InsideChartParser(grammar),
|
||||
pchart.RandomChartParser(grammar),
|
||||
pchart.UnsortedChartParser(grammar),
|
||||
pchart.LongestChartParser(grammar),
|
||||
pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser
|
||||
]
|
||||
|
||||
# Run the parsers on the tokenized sentence.
|
||||
times = []
|
||||
average_p = []
|
||||
num_parses = []
|
||||
all_parses = {}
|
||||
for parser in parsers:
|
||||
print(f"\ns: {sent}\nparser: {parser}\ngrammar: {grammar}")
|
||||
parser.trace(3)
|
||||
t = time.time()
|
||||
parses = list(parser.parse(tokens))
|
||||
times.append(time.time() - t)
|
||||
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
|
||||
average_p.append(p)
|
||||
num_parses.append(len(parses))
|
||||
for p in parses:
|
||||
all_parses[p.freeze()] = 1
|
||||
|
||||
# Print some summary statistics
|
||||
print()
|
||||
print(" Parser Beam | Time (secs) # Parses Average P(parse)")
|
||||
print("------------------------+------------------------------------------")
|
||||
for i in range(len(parsers)):
|
||||
print(
|
||||
"%18s %4d |%11.4f%11d%19.14f"
|
||||
% (
|
||||
parsers[i].__class__.__name__,
|
||||
parsers[i].beam_size,
|
||||
times[i],
|
||||
num_parses[i],
|
||||
average_p[i],
|
||||
)
|
||||
)
|
||||
parses = all_parses.keys()
|
||||
if parses:
|
||||
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
|
||||
else:
|
||||
p = 0
|
||||
print("------------------------+------------------------------------------")
|
||||
print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
|
||||
|
||||
if draw_parses is None:
|
||||
# Ask the user if we should draw the parses.
|
||||
print()
|
||||
print("Draw parses (y/n)? ", end=" ")
|
||||
draw_parses = sys.stdin.readline().strip().lower().startswith("y")
|
||||
if draw_parses:
|
||||
from nltk.draw.tree import draw_trees
|
||||
|
||||
print(" please wait...")
|
||||
draw_trees(*parses)
|
||||
|
||||
if print_parses is None:
|
||||
# Ask the user if we should print the parses.
|
||||
print()
|
||||
print("Print parses (y/n)? ", end=" ")
|
||||
print_parses = sys.stdin.readline().strip().lower().startswith("y")
|
||||
if print_parses:
|
||||
for parse in parses:
|
||||
print(parse)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
@@ -0,0 +1,716 @@
|
||||
# Natural Language Toolkit: Dependency Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Jason Narad <jason.narad@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
from collections import defaultdict
|
||||
from functools import total_ordering
|
||||
from itertools import chain
|
||||
|
||||
from nltk.grammar import (
|
||||
DependencyGrammar,
|
||||
DependencyProduction,
|
||||
ProbabilisticDependencyGrammar,
|
||||
)
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
|
||||
#################################################################
|
||||
# Dependency Span
|
||||
#################################################################
|
||||
|
||||
|
||||
@total_ordering
|
||||
class DependencySpan:
|
||||
"""
|
||||
A contiguous span over some part of the input string representing
|
||||
dependency (head -> modifier) relationships amongst words. An atomic
|
||||
span corresponds to only one word so it isn't a 'span' in the conventional
|
||||
sense, as its _start_index = _end_index = _head_index for concatenation
|
||||
purposes. All other spans are assumed to have arcs between all nodes
|
||||
within the start and end indexes of the span, and one head index corresponding
|
||||
to the head word for the entire span. This is the same as the root node if
|
||||
the dependency structure were depicted as a graph.
|
||||
"""
|
||||
|
||||
def __init__(self, start_index, end_index, head_index, arcs, tags):
|
||||
self._start_index = start_index
|
||||
self._end_index = end_index
|
||||
self._head_index = head_index
|
||||
self._arcs = arcs
|
||||
self._tags = tags
|
||||
self._comparison_key = (start_index, end_index, head_index, tuple(arcs))
|
||||
self._hash = hash(self._comparison_key)
|
||||
|
||||
def head_index(self):
|
||||
"""
|
||||
:return: A value indexing the head of the entire ``DependencySpan``.
|
||||
:rtype: int
|
||||
"""
|
||||
return self._head_index
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
:return: A concise string representation of the ``DependencySpan``.
|
||||
:rtype: str.
|
||||
"""
|
||||
return "Span %d-%d; Head Index: %d" % (
|
||||
self._start_index,
|
||||
self._end_index,
|
||||
self._head_index,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
:return: A verbose string representation of the ``DependencySpan``.
|
||||
:rtype: str
|
||||
"""
|
||||
str = "Span %d-%d; Head Index: %d" % (
|
||||
self._start_index,
|
||||
self._end_index,
|
||||
self._head_index,
|
||||
)
|
||||
for i in range(len(self._arcs)):
|
||||
str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i])
|
||||
return str
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
type(self) == type(other) and self._comparison_key == other._comparison_key
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __lt__(self, other):
|
||||
if not isinstance(other, DependencySpan):
|
||||
raise_unorderable_types("<", self, other)
|
||||
return self._comparison_key < other._comparison_key
|
||||
|
||||
def __hash__(self):
|
||||
"""
|
||||
:return: The hash value of this ``DependencySpan``.
|
||||
"""
|
||||
return self._hash
|
||||
|
||||
|
||||
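
# Illustrative sketch (not part of the NLTK source): how a DependencySpan
# encodes a small analysis.  For the two words "the" (index 0) and "cats"
# (index 1), arcs[i] gives the index of the head of word (start_index + i),
# with -1 marking a word whose head has not been assigned yet.  The values
# below are invented for the example.
def _dependency_span_example():
    # Atomic, single-word spans: each covers exactly one word, which is also its head.
    the = DependencySpan(0, 1, 0, [-1], ["null"])
    cats = DependencySpan(1, 2, 1, [-1], ["null"])
    print(the)
    print(cats)
    # A concatenated span covering words 0-1 whose head is word 1 ("cats");
    # word 0 ("the") now has word 1 recorded as its head.
    the_cats = DependencySpan(0, 2, 1, [1, -1], ["null", "null"])
    print(the_cats)
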
#################################################################
|
||||
# Chart Cell
|
||||
#################################################################
|
||||
|
||||
|
||||
class ChartCell:
|
||||
"""
|
||||
A cell from the parse chart formed when performing the CYK algorithm.
|
||||
Each cell keeps track of its x and y coordinates (though this will probably
|
||||
be discarded), and a set of spans serving as the cell's entries.
|
||||
"""
|
||||
|
||||
def __init__(self, x, y):
|
||||
"""
|
||||
:param x: This cell's x coordinate.
|
||||
:type x: int.
|
||||
:param y: This cell's y coordinate.
|
||||
:type y: int.
|
||||
"""
|
||||
self._x = x
|
||||
self._y = y
|
||||
self._entries = set()
|
||||
|
||||
def add(self, span):
|
||||
"""
|
||||
Adds the given span to the set of spans
|
||||
representing the chart cell's entries.
|
||||
|
||||
:param span: The span to add.
|
||||
:type span: DependencySpan
|
||||
"""
|
||||
self._entries.add(span)
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
:return: A verbose string representation of this ``ChartCell``.
|
||||
:rtype: str.
|
||||
"""
|
||||
return "CC[%d,%d]: %s" % (self._x, self._y, self._entries)
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
:return: A concise string representation of this ``ChartCell``.
|
||||
:rtype: str.
|
||||
"""
|
||||
return "%s" % self
|
||||
|
||||
|
||||
#################################################################
|
||||
# Parsing with Dependency Grammars
|
||||
#################################################################
|
||||
|
||||
|
||||
class ProjectiveDependencyParser:
|
||||
"""
|
||||
A projective, rule-based, dependency parser. A ProjectiveDependencyParser
|
||||
is created with a DependencyGrammar, a set of productions specifying
|
||||
word-to-word dependency relations. The parse() method will then
|
||||
return the set of all parses, in tree representation, for a given input
|
||||
sequence of tokens. Each parse must meet the requirements of both
|
||||
the grammar and the projectivity constraint which specifies that the
|
||||
branches of the dependency tree are not allowed to cross. Alternatively,
|
||||
this can be understood as stating that each parent node and its children
|
||||
in the parse tree form a continuous substring of the input sequence.
|
||||
"""
|
||||
|
||||
def __init__(self, dependency_grammar):
|
||||
"""
|
||||
Create a new ProjectiveDependencyParser, from a word-to-word
|
||||
dependency grammar ``DependencyGrammar``.
|
||||
|
||||
:param dependency_grammar: A word-to-word relation dependency grammar.
|
||||
:type dependency_grammar: DependencyGrammar
|
||||
"""
|
||||
self._grammar = dependency_grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Performs a projective dependency parse on the list of tokens using
|
||||
a chart-based, span-concatenation algorithm similar to Eisner (1996).
|
||||
|
||||
:param tokens: The list of input tokens.
|
||||
:type tokens: list(str)
|
||||
:return: An iterator over parse trees.
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
self._tokens = list(tokens)
|
||||
chart = []
|
||||
for i in range(0, len(self._tokens) + 1):
|
||||
chart.append([])
|
||||
for j in range(0, len(self._tokens) + 1):
|
||||
chart[i].append(ChartCell(i, j))
|
||||
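# Cells where i == j + 1 lie just off the diagonal and hold the atomic,
# single-word spans; an arc value of -1 marks a word whose head has not
# been assigned yet.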
if i == j + 1:
|
||||
chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
|
||||
|
||||
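# Fill in the rest of the chart bottom-up: for every span (j, i) of
# increasing width, try every split point k and combine the entries of the
# two sub-spans with concatenate(), keeping any span the grammar licenses.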
for i in range(1, len(self._tokens) + 1):
|
||||
for j in range(i - 2, -1, -1):
|
||||
for k in range(i - 1, j, -1):
|
||||
for span1 in chart[k][j]._entries:
|
||||
for span2 in chart[i][k]._entries:
|
||||
for newspan in self.concatenate(span1, span2):
|
||||
chart[i][j].add(newspan)
|
||||
|
||||
for parse in chart[len(self._tokens)][0]._entries:
|
||||
conll_format = ""
|
||||
# malt_format = ""
|
||||
for i in range(len(tokens)):
|
||||
# malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
|
||||
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
|
||||
# Modified to comply with the new DependencyGraph requirement that there be at least one ROOT element.
|
||||
conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
|
||||
i + 1,
|
||||
tokens[i],
|
||||
tokens[i],
|
||||
"null",
|
||||
"null",
|
||||
"null",
|
||||
parse._arcs[i] + 1,
|
||||
"ROOT",
|
||||
"-",
|
||||
"-",
|
||||
)
|
||||
dg = DependencyGraph(conll_format)
|
||||
# if self.meets_arity(dg):
|
||||
yield dg.tree()
|
||||
|
||||
def concatenate(self, span1, span2):
|
||||
"""
|
||||
Concatenates the two spans in whichever way possible. This
|
||||
includes rightward concatenation (from the leftmost word of the
|
||||
leftmost span to the rightmost word of the rightmost span) and
|
||||
leftward concatenation (vice-versa) between adjacent spans. Unlike
|
||||
Eisner's presentation of span concatenation, these spans do not
|
||||
share or pivot on a particular word/word-index.
|
||||
|
||||
:return: A list of new spans formed through concatenation.
|
||||
:rtype: list(DependencySpan)
|
||||
"""
|
||||
spans = []
|
||||
if span1._start_index == span2._start_index:
|
||||
print("Error: Mismatched spans - replace this with thrown error")
|
||||
if span1._start_index > span2._start_index:
|
||||
temp_span = span1
|
||||
span1 = span2
|
||||
span2 = temp_span
|
||||
# adjacent rightward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
new_tags = span1._tags + span2._tags
|
||||
if self._grammar.contains(
|
||||
self._tokens[span1._head_index], self._tokens[span2._head_index]
|
||||
):
|
||||
# print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index))
|
||||
new_arcs[span2._head_index - span1._start_index] = span1._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span1._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
# adjacent leftward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
if self._grammar.contains(
|
||||
self._tokens[span2._head_index], self._tokens[span1._head_index]
|
||||
):
|
||||
# print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index))
|
||||
new_arcs[span1._head_index - span1._start_index] = span2._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span2._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
return spans
|
||||
|
||||
|
||||
#################################################################
|
||||
# Parsing with Probabilistic Dependency Grammars
|
||||
#################################################################
|
||||
|
||||
|
||||
class ProbabilisticProjectiveDependencyParser:
|
||||
"""A probabilistic, projective dependency parser.
|
||||
|
||||
This parser returns the most probable projective parse derived from the
|
||||
probabilistic dependency grammar derived from the train() method. The
|
||||
probabilistic model is an implementation of Eisner's (1996) Model C, which
|
||||
conditions on head-word, head-tag, child-word, and child-tag. The decoding
|
||||
uses a bottom-up chart-based span concatenation algorithm that's identical
|
||||
to the one utilized by the rule-based projective parser.
|
||||
|
||||
Usage example
|
||||
|
||||
>>> from nltk.parse.dependencygraph import conll_data2
|
||||
|
||||
>>> graphs = [
|
||||
... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry
|
||||
... ]
|
||||
|
||||
>>> ppdp = ProbabilisticProjectiveDependencyParser()
|
||||
>>> ppdp.train(graphs)
|
||||
|
||||
>>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
|
||||
>>> list(ppdp.parse(sent))
|
||||
[Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])]
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Create a new probabilistic dependency parser. No additional
|
||||
operations are necessary.
|
||||
"""
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Parses the list of tokens subject to the projectivity constraint
|
||||
and the productions in the parser's grammar. This uses a method
|
||||
similar to the span-concatenation algorithm defined in Eisner (1996).
|
||||
It returns the most probable parse derived from the parser's
|
||||
probabilistic dependency grammar.
|
||||
"""
|
||||
self._tokens = list(tokens)
|
||||
chart = []
|
||||
for i in range(0, len(self._tokens) + 1):
|
||||
chart.append([])
|
||||
for j in range(0, len(self._tokens) + 1):
|
||||
chart[i].append(ChartCell(i, j))
|
||||
if i == j + 1:
|
||||
if tokens[i - 1] in self._grammar._tags:
|
||||
for tag in self._grammar._tags[tokens[i - 1]]:
|
||||
chart[i][j].add(
|
||||
DependencySpan(i - 1, i, i - 1, [-1], [tag])
|
||||
)
|
||||
else:
|
||||
print(
|
||||
"No tag found for input token '%s', parse is impossible."
|
||||
% tokens[i - 1]
|
||||
)
|
||||
return []
|
||||
for i in range(1, len(self._tokens) + 1):
|
||||
for j in range(i - 2, -1, -1):
|
||||
for k in range(i - 1, j, -1):
|
||||
for span1 in chart[k][j]._entries:
|
||||
for span2 in chart[i][k]._entries:
|
||||
for newspan in self.concatenate(span1, span2):
|
||||
chart[i][j].add(newspan)
|
||||
trees = []
|
||||
max_parse = None
|
||||
max_score = 0
|
||||
for parse in chart[len(self._tokens)][0]._entries:
|
||||
conll_format = ""
|
||||
malt_format = ""
|
||||
for i in range(len(tokens)):
|
||||
malt_format += "%s\t%s\t%d\t%s\n" % (
|
||||
tokens[i],
|
||||
"null",
|
||||
parse._arcs[i] + 1,
|
||||
"null",
|
||||
)
|
||||
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
|
||||
# Modified to comply with a recent change in DependencyGraph requiring a ROOT element.
|
||||
conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
|
||||
i + 1,
|
||||
tokens[i],
|
||||
tokens[i],
|
||||
parse._tags[i],
|
||||
parse._tags[i],
|
||||
"null",
|
||||
parse._arcs[i] + 1,
|
||||
"ROOT",
|
||||
"-",
|
||||
"-",
|
||||
)
|
||||
dg = DependencyGraph(conll_format)
|
||||
score = self.compute_prob(dg)
|
||||
trees.append((score, dg.tree()))
|
||||
trees.sort()
|
||||
return (tree for (score, tree) in trees)
|
||||
|
||||
def concatenate(self, span1, span2):
|
||||
"""
|
||||
Concatenates the two spans in whichever way possible. This
|
||||
includes rightward concatenation (from the leftmost word of the
|
||||
leftmost span to the rightmost word of the rightmost span) and
|
||||
leftward concatenation (vice-versa) between adjacent spans. Unlike
|
||||
Eisner's presentation of span concatenation, these spans do not
|
||||
share or pivot on a particular word/word-index.
|
||||
|
||||
:return: A list of new spans formed through concatenation.
|
||||
:rtype: list(DependencySpan)
|
||||
"""
|
||||
spans = []
|
||||
if span1._start_index == span2._start_index:
|
||||
print("Error: Mismatched spans - replace this with thrown error")
|
||||
if span1._start_index > span2._start_index:
|
||||
temp_span = span1
|
||||
span1 = span2
|
||||
span2 = temp_span
|
||||
# adjacent rightward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
new_tags = span1._tags + span2._tags
|
||||
if self._grammar.contains(
|
||||
self._tokens[span1._head_index], self._tokens[span2._head_index]
|
||||
):
|
||||
new_arcs[span2._head_index - span1._start_index] = span1._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span1._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
# adjacent leftward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
new_tags = span1._tags + span2._tags
|
||||
if self._grammar.contains(
|
||||
self._tokens[span2._head_index], self._tokens[span1._head_index]
|
||||
):
|
||||
new_arcs[span1._head_index - span1._start_index] = span2._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span2._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
return spans
|
||||
|
||||
def train(self, graphs):
|
||||
"""
|
||||
Trains a ProbabilisticDependencyGrammar based on the list of input
|
||||
DependencyGraphs. This model is an implementation of Eisner's (1996)
|
||||
Model C, which derives its statistics from head-word, head-tag,
|
||||
child-word, and child-tag relationships.
|
||||
|
||||
:param graphs: A list of dependency graphs to train from.
|
||||
:type: list(DependencyGraph)
|
||||
"""
|
||||
productions = []
|
||||
events = defaultdict(int)
|
||||
tags = {}
|
||||
for dg in graphs:
|
||||
for node_index in range(1, len(dg.nodes)):
|
||||
# children = dg.nodes[node_index]['deps']
|
||||
children = list(
|
||||
chain.from_iterable(dg.nodes[node_index]["deps"].values())
|
||||
)
|
||||
|
||||
nr_left_children = dg.left_children(node_index)
|
||||
nr_right_children = dg.right_children(node_index)
|
||||
nr_children = nr_left_children + nr_right_children
|
||||
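# child_index enumerates dependent positions around the head: negative
# values are positions to the left of the head, positive values positions
# to the right; the extra position at each end generates a STOP event,
# and child_index == 0 (the head itself) records nothing.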
for child_index in range(
|
||||
0 - (nr_left_children + 1), nr_right_children + 2
|
||||
):
|
||||
head_word = dg.nodes[node_index]["word"]
|
||||
head_tag = dg.nodes[node_index]["tag"]
|
||||
if head_word in tags:
|
||||
tags[head_word].add(head_tag)
|
||||
else:
|
||||
tags[head_word] = {head_tag}
|
||||
child = "STOP"
|
||||
child_tag = "STOP"
|
||||
prev_word = "START"
|
||||
prev_tag = "START"
|
||||
if child_index < 0:
|
||||
array_index = child_index + nr_left_children
|
||||
if array_index >= 0:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != -1:
|
||||
prev_word = dg.nodes[children[array_index + 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index + 1]]["tag"]
|
||||
if child != "STOP":
|
||||
productions.append(DependencyProduction(head_word, [child]))
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) left))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
events[head_event] += 1
|
||||
events[mod_event] += 1
|
||||
elif child_index > 0:
|
||||
array_index = child_index + nr_left_children - 1
|
||||
if array_index < nr_children:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != 1:
|
||||
prev_word = dg.nodes[children[array_index - 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index - 1]]["tag"]
|
||||
if child != "STOP":
|
||||
productions.append(DependencyProduction(head_word, [child]))
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) right))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
events[head_event] += 1
|
||||
events[mod_event] += 1
|
||||
self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
|
||||
|
||||
def compute_prob(self, dg):
|
||||
"""
|
||||
Computes the probability of a dependency graph based
|
||||
on the parser's probability model (defined by the parser's
|
||||
statistical dependency grammar).
|
||||
|
||||
:param dg: A dependency graph to score.
|
||||
:type dg: DependencyGraph
|
||||
:return: The probability of the dependency graph.
|
||||
:rtype: float
|
||||
"""
|
||||
prob = 1.0
|
||||
for node_index in range(1, len(dg.nodes)):
|
||||
# children = dg.nodes[node_index]['deps']
|
||||
children = list(chain.from_iterable(dg.nodes[node_index]["deps"].values()))
|
||||
|
||||
nr_left_children = dg.left_children(node_index)
|
||||
nr_right_children = dg.right_children(node_index)
|
||||
nr_children = nr_left_children + nr_right_children
|
||||
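# Mirror of the loop in train(): visit the same dependent positions for
# this head and multiply in each head event's relative frequency,
# h_count / m_count; if the modifier event was never observed in training,
# the probability is reset to a very small constant instead.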
for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
|
||||
head_word = dg.nodes[node_index]["word"]
|
||||
head_tag = dg.nodes[node_index]["tag"]
|
||||
child = "STOP"
|
||||
child_tag = "STOP"
|
||||
prev_word = "START"
|
||||
prev_tag = "START"
|
||||
if child_index < 0:
|
||||
array_index = child_index + nr_left_children
|
||||
if array_index >= 0:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != -1:
|
||||
prev_word = dg.nodes[children[array_index + 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index + 1]]["tag"]
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) left))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
h_count = self._grammar._events[head_event]
|
||||
m_count = self._grammar._events[mod_event]
|
||||
|
||||
# If the grammar is not covered
|
||||
if m_count != 0:
|
||||
prob *= h_count / m_count
|
||||
else:
|
||||
prob = 0.00000001 # Very small number
|
||||
|
||||
elif child_index > 0:
|
||||
array_index = child_index + nr_left_children - 1
|
||||
if array_index < nr_children:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != 1:
|
||||
prev_word = dg.nodes[children[array_index - 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index - 1]]["tag"]
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) right))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
h_count = self._grammar._events[head_event]
|
||||
m_count = self._grammar._events[mod_event]
|
||||
|
||||
if m_count != 0:
|
||||
prob *= h_count / m_count
|
||||
else:
|
||||
prob = 0.00000001 # Very small number
|
||||
|
||||
return prob
|
||||
|
||||
|
||||
#################################################################
|
||||
# Demos
|
||||
#################################################################
|
||||
|
||||
|
||||
def demo():
|
||||
projective_rule_parse_demo()
|
||||
# arity_parse_demo()
|
||||
projective_prob_parse_demo()
|
||||
|
||||
|
||||
def projective_rule_parse_demo():
|
||||
"""
|
||||
A demonstration showing the creation and use of a
|
||||
``DependencyGrammar`` to perform a projective dependency
|
||||
parse.
|
||||
"""
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'scratch' -> 'cats' | 'walls'
|
||||
'walls' -> 'the'
|
||||
'cats' -> 'the'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
pdp = ProjectiveDependencyParser(grammar)
|
||||
trees = pdp.parse(["the", "cats", "scratch", "the", "walls"])
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
|
||||
|
||||
def arity_parse_demo():
|
||||
"""
|
||||
A demonstration showing the creation of a ``DependencyGrammar``
|
||||
in which a specific number of modifiers is listed for a given
|
||||
head. This can further constrain the number of possible parses
|
||||
created by a ``ProjectiveDependencyParser``.
|
||||
"""
|
||||
print()
|
||||
print("A grammar with no arity constraints. Each DependencyProduction")
|
||||
print("specifies a relationship between one head word and only one")
|
||||
print("modifier word.")
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'fell' -> 'price' | 'stock'
|
||||
'price' -> 'of' | 'the'
|
||||
'of' -> 'stock'
|
||||
'stock' -> 'the'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
|
||||
print()
|
||||
print("For the sentence 'The price of the stock fell', this grammar")
|
||||
print("will produce the following three parses:")
|
||||
pdp = ProjectiveDependencyParser(grammar)
|
||||
trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
|
||||
print()
|
||||
print("By contrast, the following grammar contains a ")
|
||||
print("DependencyProduction that specifies a relationship")
|
||||
print("between a single head word, 'price', and two modifier")
|
||||
print("words, 'of' and 'the'.")
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'fell' -> 'price' | 'stock'
|
||||
'price' -> 'of' 'the'
|
||||
'of' -> 'stock'
|
||||
'stock' -> 'the'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
|
||||
print()
|
||||
print(
|
||||
"This constrains the number of possible parses to just one:"
|
||||
) # unimplemented, soon to replace
|
||||
pdp = ProjectiveDependencyParser(grammar)
|
||||
trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
|
||||
|
||||
def projective_prob_parse_demo():
|
||||
"""
|
||||
A demo showing the training and use of a projective
|
||||
dependency parser.
|
||||
"""
|
||||
from nltk.parse.dependencygraph import conll_data2
|
||||
|
||||
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
||||
ppdp = ProbabilisticProjectiveDependencyParser()
|
||||
print("Training Probabilistic Projective Dependency Parser...")
|
||||
ppdp.train(graphs)
|
||||
|
||||
sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
|
||||
print("Parsing '", " ".join(sent), "'...")
|
||||
print("Parse:")
|
||||
for tree in ppdp.parse(sent):
|
||||
print(tree)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
684
backend/venv/Lib/site-packages/nltk/parse/recursivedescent.py
Normal file
@@ -0,0 +1,684 @@
|
||||
# Natural Language Toolkit: Recursive Descent Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.grammar import Nonterminal
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import ImmutableTree, Tree
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Recursive Descent Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class RecursiveDescentParser(ParserI):
|
||||
"""
|
||||
A simple top-down CFG parser that parses texts by recursively
|
||||
expanding the fringe of a Tree, and matching it against a
|
||||
text.
|
||||
|
||||
``RecursiveDescentParser`` uses a list of tree locations called a
|
||||
"frontier" to remember which subtrees have not yet been expanded
|
||||
and which leaves have not yet been matched against the text. Each
|
||||
tree location consists of a list of child indices specifying the
|
||||
path from the root of the tree to a subtree or a leaf; see the
|
||||
reference documentation for Tree for more information
|
||||
about tree locations.
|
||||
|
||||
When the parser begins parsing a text, it constructs a tree
|
||||
containing only the start symbol, and a frontier containing the
|
||||
location of the tree's root node. It then extends the tree to
|
||||
cover the text, using the following recursive procedure:
|
||||
|
||||
- If the frontier is empty, and the text is covered by the tree,
|
||||
then return the tree as a possible parse.
|
||||
- If the frontier is empty, and the text is not covered by the
|
||||
tree, then return no parses.
|
||||
- If the first element of the frontier is a subtree, then
|
||||
use CFG productions to "expand" it. For each applicable
|
||||
production, add the expanded subtree's children to the
|
||||
frontier, and recursively find all parses that can be
|
||||
generated by the new tree and frontier.
|
||||
- If the first element of the frontier is a token, then "match"
|
||||
it against the next token from the text. Remove the token
|
||||
from the frontier, and recursively find all parses that can be
|
||||
generated by the new tree and frontier.
|
||||
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
"""
|
||||
Create a new ``RecursiveDescentParser``, that uses ``grammar``
|
||||
to parse texts.
|
||||
|
||||
:type grammar: CFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
# Inherit docs from ParserI
|
||||
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
|
||||
# Start a recursive descent parse, with an initial tree
|
||||
# containing just the start symbol.
|
||||
start = self._grammar.start().symbol()
|
||||
initial_tree = Tree(start, [])
|
||||
frontier = [()]
|
||||
if self._trace:
|
||||
self._trace_start(initial_tree, frontier, tokens)
|
||||
return self._parse(tokens, initial_tree, frontier)
|
||||
|
||||
def _parse(self, remaining_text, tree, frontier):
|
||||
"""
|
||||
Recursively expand and match each element of ``tree``
|
||||
specified by ``frontier``, to cover ``remaining_text``. Return
|
||||
a list of all parses found.
|
||||
|
||||
:return: An iterator of all parses that can be generated by
|
||||
matching and expanding the elements of ``tree``
|
||||
specified by ``frontier``.
|
||||
:rtype: iter(Tree)
|
||||
:type tree: Tree
|
||||
:param tree: A partial structure for the text that is
|
||||
currently being parsed. The elements of ``tree``
|
||||
that are specified by ``frontier`` have not yet been
|
||||
expanded or matched.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``tree``.
|
||||
:type frontier: list(tuple(int))
|
||||
:param frontier: A list of the locations within ``tree`` of
|
||||
all subtrees that have not yet been expanded, and all
|
||||
leaves that have not yet been matched. This list is sorted
|
||||
in left-to-right order of location within the tree.
|
||||
"""
|
||||
|
||||
# If the tree covers the text, and there's nothing left to
|
||||
# expand, then we've found a complete parse; return it.
|
||||
if len(remaining_text) == 0 and len(frontier) == 0:
|
||||
if self._trace:
|
||||
self._trace_succeed(tree, frontier)
|
||||
yield tree
|
||||
|
||||
# If there's still text, but nothing left to expand, we failed.
|
||||
elif len(frontier) == 0:
|
||||
if self._trace:
|
||||
self._trace_backtrack(tree, frontier)
|
||||
|
||||
# If the next element on the frontier is a tree, expand it.
|
||||
elif isinstance(tree[frontier[0]], Tree):
|
||||
yield from self._expand(remaining_text, tree, frontier)
|
||||
|
||||
# If the next element on the frontier is a token, match it.
|
||||
else:
|
||||
yield from self._match(remaining_text, tree, frontier)
|
||||
|
||||
def _match(self, rtext, tree, frontier):
|
||||
"""
|
||||
:rtype: iter(Tree)
|
||||
:return: an iterator of all parses that can be generated by
|
||||
matching the first element of ``frontier`` against the
|
||||
first token in ``rtext``. In particular, if the first
|
||||
element of ``frontier`` has the same type as the first
|
||||
token in ``rtext``, then substitute the token into
|
||||
``tree``; and return all parses that can be generated by
|
||||
matching and expanding the remaining elements of
|
||||
``frontier``. If the first element of ``frontier`` does not
|
||||
have the same type as the first token in ``rtext``, then
|
||||
return an empty list.
|
||||
|
||||
:type tree: Tree
|
||||
:param tree: A partial structure for the text that is
|
||||
currently being parsed. The elements of ``tree``
|
||||
that are specified by ``frontier`` have not yet been
|
||||
expanded or matched.
|
||||
:type rtext: list(str)
|
||||
:param rtext: The portion of the text that is not yet
|
||||
covered by ``tree``.
|
||||
:type frontier: list of tuple of int
|
||||
:param frontier: A list of the locations within ``tree`` of
|
||||
all subtrees that have not yet been expanded, and all
|
||||
leaves that have not yet been matched.
|
||||
"""
|
||||
|
||||
tree_leaf = tree[frontier[0]]
|
||||
if len(rtext) > 0 and tree_leaf == rtext[0]:
|
||||
# If it's a terminal that matches rtext[0], then substitute
|
||||
# in the token, and continue parsing.
|
||||
newtree = tree.copy(deep=True)
|
||||
newtree[frontier[0]] = rtext[0]
|
||||
if self._trace:
|
||||
self._trace_match(newtree, frontier[1:], rtext[0])
|
||||
yield from self._parse(rtext[1:], newtree, frontier[1:])
|
||||
else:
|
||||
# If it's a non-matching terminal, fail.
|
||||
if self._trace:
|
||||
self._trace_backtrack(tree, frontier, rtext[:1])
|
||||
|
||||
def _expand(self, remaining_text, tree, frontier, production=None):
|
||||
"""
|
||||
:rtype: iter(Tree)
|
||||
:return: An iterator of all parses that can be generated by
|
||||
expanding the first element of ``frontier`` with
|
||||
``production``. In particular, if the first element of
|
||||
``frontier`` is a subtree whose node type is equal to
|
||||
``production``'s left hand side, then add a child to that
|
||||
subtree for each element of ``production``'s right hand
|
||||
side; and return all parses that can be generated by
|
||||
matching and expanding the remaining elements of
|
||||
``frontier``. If the first element of ``frontier`` is not a
|
||||
subtree whose node type is equal to ``production``'s left
|
||||
hand side, then return an empty list. If ``production`` is
|
||||
not specified, then return a list of all parses that can
|
||||
be generated by expanding the first element of ``frontier``
|
||||
with *any* CFG production.
|
||||
|
||||
:type tree: Tree
|
||||
:param tree: A partial structure for the text that is
|
||||
currently being parsed. The elements of ``tree``
|
||||
that are specified by ``frontier`` have not yet been
|
||||
expanded or matched.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``tree``.
|
||||
:type frontier: list(tuple(int))
|
||||
:param frontier: A list of the locations within ``tree`` of
|
||||
all subtrees that have not yet been expanded, and all
|
||||
leaves that have not yet been matched.
|
||||
"""
|
||||
|
||||
if production is None:
|
||||
productions = self._grammar.productions()
|
||||
else:
|
||||
productions = [production]
|
||||
|
||||
for production in productions:
|
||||
lhs = production.lhs().symbol()
|
||||
if lhs == tree[frontier[0]].label():
|
||||
subtree = self._production_to_tree(production)
|
||||
if frontier[0] == ():
|
||||
newtree = subtree
|
||||
else:
|
||||
newtree = tree.copy(deep=True)
|
||||
newtree[frontier[0]] = subtree
|
||||
new_frontier = [
|
||||
frontier[0] + (i,) for i in range(len(production.rhs()))
|
||||
]
|
||||
if self._trace:
|
||||
self._trace_expand(newtree, new_frontier, production)
|
||||
yield from self._parse(
|
||||
remaining_text, newtree, new_frontier + frontier[1:]
|
||||
)
|
||||
|
||||
def _production_to_tree(self, production):
|
||||
"""
|
||||
:rtype: Tree
|
||||
:return: The Tree that is licensed by ``production``.
|
||||
In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
|
||||
return a tree that has a node ``lhs.symbol``, and
|
||||
``n`` children. For each nonterminal element
|
||||
``elt[i]`` in the production, the tree token has a
|
||||
childless subtree with node value ``elt[i].symbol``; and
|
||||
for each terminal element ``elt[j]``, the tree token has
|
||||
a leaf token with type ``elt[j]``.
|
||||
|
||||
:param production: The CFG production that licenses the tree
|
||||
token that should be returned.
|
||||
:type production: Production
|
||||
"""
|
||||
children = []
|
||||
for elt in production.rhs():
|
||||
if isinstance(elt, Nonterminal):
|
||||
children.append(Tree(elt.symbol(), []))
|
||||
else:
|
||||
# This will be matched.
|
||||
children.append(elt)
|
||||
return Tree(production.lhs().symbol(), children)
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
self._trace = trace
|
||||
|
||||
def _trace_fringe(self, tree, treeloc=None):
|
||||
"""
|
||||
Print trace output displaying the fringe of ``tree``. The
|
||||
fringe of ``tree`` consists of all of its leaves and all of
|
||||
its childless subtrees.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
if treeloc == ():
|
||||
print("*", end=" ")
|
||||
if isinstance(tree, Tree):
|
||||
if len(tree) == 0:
|
||||
print(repr(Nonterminal(tree.label())), end=" ")
|
||||
for i in range(len(tree)):
|
||||
if treeloc is not None and i == treeloc[0]:
|
||||
self._trace_fringe(tree[i], treeloc[1:])
|
||||
else:
|
||||
self._trace_fringe(tree[i])
|
||||
else:
|
||||
print(repr(tree), end=" ")
|
||||
|
||||
def _trace_tree(self, tree, frontier, operation):
|
||||
"""
|
||||
Print trace output displaying the parser's current state.
|
||||
|
||||
:param operation: A character identifying the operation that
|
||||
generated the current state.
|
||||
:rtype: None
|
||||
"""
|
||||
if self._trace == 2:
|
||||
print(" %c [" % operation, end=" ")
|
||||
else:
|
||||
print(" [", end=" ")
|
||||
if len(frontier) > 0:
|
||||
self._trace_fringe(tree, frontier[0])
|
||||
else:
|
||||
self._trace_fringe(tree)
|
||||
print("]")
|
||||
|
||||
def _trace_start(self, tree, frontier, text):
|
||||
print("Parsing %r" % " ".join(text))
|
||||
if self._trace > 2:
|
||||
print("Start:")
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, " ")
|
||||
|
||||
def _trace_expand(self, tree, frontier, production):
|
||||
if self._trace > 2:
|
||||
print("Expand: %s" % production)
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, "E")
|
||||
|
||||
def _trace_match(self, tree, frontier, tok):
|
||||
if self._trace > 2:
|
||||
print("Match: %r" % tok)
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, "M")
|
||||
|
||||
def _trace_succeed(self, tree, frontier):
|
||||
if self._trace > 2:
|
||||
print("GOOD PARSE:")
|
||||
if self._trace == 1:
|
||||
print("Found a parse:\n%s" % tree)
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, "+")
|
||||
|
||||
def _trace_backtrack(self, tree, frontier, toks=None):
|
||||
if self._trace > 2:
|
||||
if toks:
|
||||
print("Backtrack: %r match failed" % toks[0])
|
||||
else:
|
||||
print("Backtrack")
|
||||
|
||||
|
||||
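
# Illustrative sketch (not part of the NLTK source): a minimal use of
# RecursiveDescentParser with a small toy CFG.  Recursive descent is top-down,
# so the grammar must not be left-recursive; the grammar and helper function
# below are invented for this example.
def _recursive_descent_example():
    from nltk.grammar import CFG

    toy_grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'the' N
        VP -> V NP
        N -> 'dog' | 'cat'
        V -> 'chased'
        """
    )
    parser = RecursiveDescentParser(toy_grammar, trace=0)
    for tree in parser.parse("the dog chased the cat".split()):
        print(tree)
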
##//////////////////////////////////////////////////////
|
||||
## Stepping Recursive Descent Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class SteppingRecursiveDescentParser(RecursiveDescentParser):
|
||||
"""
|
||||
A ``RecursiveDescentParser`` that allows you to step through the
|
||||
parsing process, performing a single operation at a time.
|
||||
|
||||
The ``initialize`` method is used to start parsing a text.
|
||||
``expand`` expands the first element on the frontier using a single
|
||||
CFG production, and ``match`` matches the first element on the
|
||||
frontier against the next text token. ``backtrack`` undoes the most
|
||||
recent expand or match operation. ``step`` performs a single
|
||||
expand, match, or backtrack operation. ``parses`` returns the set
|
||||
of parses that have been found by the parser.
|
||||
|
||||
:ivar _history: A list of ``(rtext, tree, frontier)`` triples,
|
||||
containing the previous states of the parser. This history is
|
||||
used to implement the ``backtrack`` operation.
|
||||
:ivar _tried_e: A record of all productions that have been tried
|
||||
for a given tree. This record is used by ``expand`` to perform
|
||||
the next untried production.
|
||||
:ivar _tried_m: A record of what tokens have been matched for a
|
||||
given tree. This record is used by ``step`` to decide whether
|
||||
or not to match a token.
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
super().__init__(grammar, trace)
|
||||
self._rtext = None
|
||||
self._tree = None
|
||||
self._frontier = [()]
|
||||
self._tried_e = {}
|
||||
self._tried_m = {}
|
||||
self._history = []
|
||||
self._parses = []
|
||||
|
||||
# [XX] TEMPORARY HACK WARNING! This should be replaced with
|
||||
# something nicer when we get the chance.
|
||||
def _freeze(self, tree):
|
||||
c = tree.copy()
|
||||
# for pos in c.treepositions('leaves'):
|
||||
# c[pos] = c[pos].freeze()
|
||||
return ImmutableTree.convert(c)
|
||||
|
||||
def parse(self, tokens):
|
||||
tokens = list(tokens)
|
||||
self.initialize(tokens)
|
||||
while self.step() is not None:
|
||||
pass
|
||||
return self.parses()
|
||||
|
||||
def initialize(self, tokens):
|
||||
"""
|
||||
Start parsing a given text. This sets the parser's tree to
|
||||
the start symbol, its frontier to the root node, and its
|
||||
remaining text to the given list of tokens.
|
||||
"""
|
||||
|
||||
self._rtext = tokens
|
||||
start = self._grammar.start().symbol()
|
||||
self._tree = Tree(start, [])
|
||||
self._frontier = [()]
|
||||
self._tried_e = {}
|
||||
self._tried_m = {}
|
||||
self._history = []
|
||||
self._parses = []
|
||||
if self._trace:
|
||||
self._trace_start(self._tree, self._frontier, self._rtext)
|
||||
|
||||
def remaining_text(self):
|
||||
"""
|
||||
:return: The portion of the text that is not yet covered by the
|
||||
tree.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
return self._rtext
|
||||
|
||||
def frontier(self):
|
||||
"""
|
||||
:return: A list of the tree locations of all subtrees that
|
||||
have not yet been expanded, and all leaves that have not
|
||||
yet been matched.
|
||||
:rtype: list(tuple(int))
|
||||
"""
|
||||
return self._frontier
|
||||
|
||||
def tree(self):
|
||||
"""
|
||||
:return: A partial structure for the text that is
|
||||
currently being parsed. The elements specified by the
|
||||
frontier have not yet been expanded or matched.
|
||||
:rtype: Tree
|
||||
"""
|
||||
return self._tree
|
||||
|
||||
def step(self):
|
||||
"""
|
||||
Perform a single parsing operation. If an untried match is
|
||||
possible, then perform the match, and return the matched
|
||||
token. If an untried expansion is possible, then perform the
|
||||
expansion, and return the production that it is based on. If
|
||||
backtracking is possible, then backtrack, and return True.
|
||||
Otherwise, return None.
|
||||
|
||||
:return: None if no operation was performed; a token if a match
|
||||
was performed; a production if an expansion was performed;
|
||||
and True if a backtrack operation was performed.
|
||||
:rtype: Production or String or bool
|
||||
"""
|
||||
# Try matching (if we haven't already)
|
||||
if self.untried_match():
|
||||
token = self.match()
|
||||
if token is not None:
|
||||
return token
|
||||
|
||||
# Try expanding.
|
||||
production = self.expand()
|
||||
if production is not None:
|
||||
return production
|
||||
|
||||
# Try backtracking
|
||||
if self.backtrack():
|
||||
self._trace_backtrack(self._tree, self._frontier)
|
||||
return True
|
||||
|
||||
# Nothing left to do.
|
||||
return None
|
||||
|
||||
def expand(self, production=None):
|
||||
"""
|
||||
Expand the first element of the frontier. In particular, if
|
||||
the first element of the frontier is a subtree whose node type
|
||||
is equal to ``production``'s left hand side, then add a child
|
||||
to that subtree for each element of ``production``'s right hand
|
||||
side. If ``production`` is not specified, then use the first
|
||||
untried expandable production. If all expandable productions
|
||||
have been tried, do nothing.
|
||||
|
||||
:return: The production used to expand the frontier, if an
|
||||
expansion was performed. If no expansion was performed,
|
||||
return None.
|
||||
:rtype: Production or None
|
||||
"""
|
||||
|
||||
# Make sure we *can* expand.
|
||||
if len(self._frontier) == 0:
|
||||
return None
|
||||
if not isinstance(self._tree[self._frontier[0]], Tree):
|
||||
return None
|
||||
|
||||
# If they didn't specify a production, check all untried ones.
|
||||
if production is None:
|
||||
productions = self.untried_expandable_productions()
|
||||
else:
|
||||
productions = [production]
|
||||
|
||||
parses = []
|
||||
for prod in productions:
|
||||
# Record that we've tried this production now.
|
||||
self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
|
||||
|
||||
# Try expanding.
|
||||
for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
|
||||
return prod
|
||||
|
||||
# We didn't expand anything.
|
||||
return None
|
||||
|
||||
def match(self):
|
||||
"""
|
||||
Match the first element of the frontier. In particular, if
|
||||
the first element of the frontier has the same type as the
|
||||
next text token, then substitute the text token into the tree.
|
||||
|
||||
:return: The token matched, if a match operation was
|
||||
performed. If no match was performed, return None
|
||||
:rtype: str or None
|
||||
"""
|
||||
|
||||
# Record that we've tried matching this token.
|
||||
tok = self._rtext[0]
|
||||
self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
|
||||
|
||||
# Make sure we *can* match.
|
||||
if len(self._frontier) == 0:
|
||||
return None
|
||||
if isinstance(self._tree[self._frontier[0]], Tree):
|
||||
return None
|
||||
|
||||
for _result in self._match(self._rtext, self._tree, self._frontier):
|
||||
# Return the token we just matched.
|
||||
return self._history[-1][0][0]
|
||||
return None
|
||||
|
||||
def backtrack(self):
|
||||
"""
|
||||
Return the parser to its state before the most recent
|
||||
        match or expand operation. Calling ``backtrack`` repeatedly returns
|
||||
the parser to successively earlier states. If no match or
|
||||
        expand operations have been performed, ``backtrack`` will make no
|
||||
changes.
|
||||
|
||||
:return: true if an operation was successfully undone.
|
||||
:rtype: bool
|
||||
"""
|
||||
if len(self._history) == 0:
|
||||
return False
|
||||
(self._rtext, self._tree, self._frontier) = self._history.pop()
|
||||
return True
|
||||
|
||||
def expandable_productions(self):
|
||||
"""
|
||||
:return: A list of all the productions for which expansions
|
||||
are available for the current parser state.
|
||||
:rtype: list(Production)
|
||||
"""
|
||||
# Make sure we *can* expand.
|
||||
if len(self._frontier) == 0:
|
||||
return []
|
||||
frontier_child = self._tree[self._frontier[0]]
|
||||
if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
|
||||
return []
|
||||
|
||||
return [
|
||||
p
|
||||
for p in self._grammar.productions()
|
||||
if p.lhs().symbol() == frontier_child.label()
|
||||
]
|
||||
|
||||
def untried_expandable_productions(self):
|
||||
"""
|
||||
:return: A list of all the untried productions for which
|
||||
expansions are available for the current parser state.
|
||||
:rtype: list(Production)
|
||||
"""
|
||||
|
||||
tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
|
||||
return [p for p in self.expandable_productions() if p not in tried_expansions]
|
||||
|
||||
def untried_match(self):
|
||||
"""
|
||||
:return: Whether the first element of the frontier is a token
|
||||
that has not yet been matched.
|
||||
:rtype: bool
|
||||
"""
|
||||
|
||||
if len(self._rtext) == 0:
|
||||
return False
|
||||
tried_matches = self._tried_m.get(self._freeze(self._tree), [])
|
||||
return self._rtext[0] not in tried_matches
|
||||
|
||||
def currently_complete(self):
|
||||
"""
|
||||
:return: Whether the parser's current state represents a
|
||||
complete parse.
|
||||
:rtype: bool
|
||||
"""
|
||||
return len(self._frontier) == 0 and len(self._rtext) == 0
|
||||
|
||||
def _parse(self, remaining_text, tree, frontier):
|
||||
"""
|
||||
        A stub version of ``_parse`` that sets the parser's current
|
||||
state to the given arguments. In ``RecursiveDescentParser``,
|
||||
the ``_parse`` method is used to recursively continue parsing a
|
||||
text. ``SteppingRecursiveDescentParser`` overrides it to
|
||||
capture these recursive calls. It records the parser's old
|
||||
state in the history (to allow for backtracking), and updates
|
||||
the parser's new state using the given arguments. Finally, it
|
||||
returns ``[1]``, which is used by ``match`` and ``expand`` to
|
||||
detect whether their operations were successful.
|
||||
|
||||
:return: ``[1]``
|
||||
:rtype: list of int
|
||||
"""
|
||||
self._history.append((self._rtext, self._tree, self._frontier))
|
||||
self._rtext = remaining_text
|
||||
self._tree = tree
|
||||
self._frontier = frontier
|
||||
|
||||
# Is it a good parse? If so, record it.
|
||||
if len(frontier) == 0 and len(remaining_text) == 0:
|
||||
self._parses.append(tree)
|
||||
self._trace_succeed(self._tree, self._frontier)
|
||||
|
||||
return [1]
|
||||
|
||||
def parses(self):
|
||||
"""
|
||||
:return: An iterator of the parses that have been found by this
|
||||
parser so far.
|
||||
:rtype: list of Tree
|
||||
"""
|
||||
return iter(self._parses)
|
||||
|
||||
def set_grammar(self, grammar):
|
||||
"""
|
||||
Change the grammar used to parse texts.
|
||||
|
||||
:param grammar: The new grammar.
|
||||
:type grammar: CFG
|
||||
"""
|
||||
self._grammar = grammar
|
||||
|
||||
|
||||
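# --- Illustrative sketch (not part of the NLTK source) ---------------------
# How the stepping parser defined above might be driven one operation at a
# time.  The grammar and sentence are hypothetical; step() returns a matched
# token, the production used for an expansion, True for a backtrack, or None
# once nothing is left to try (see the docstrings above).
def _stepping_rd_sketch():
    from nltk import CFG

    toy_grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'I' | Det N
        VP -> V NP
        Det -> 'a'
        N -> 'dog'
        V -> 'saw'
        """
    )
    parser = SteppingRecursiveDescentParser(toy_grammar)
    parser.initialize("I saw a dog".split())
    # Keep stepping until every match, expansion and backtrack is exhausted.
    while parser.step() is not None:
        pass
    return list(parser.parses())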
##//////////////////////////////////////////////////////
|
||||
## Demonstration Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
A demonstration of the recursive descent parser.
|
||||
"""
|
||||
|
||||
from nltk import CFG, parse
|
||||
|
||||
grammar = CFG.fromstring(
|
||||
"""
|
||||
S -> NP VP
|
||||
NP -> Det N | Det N PP
|
||||
VP -> V NP | V NP PP
|
||||
PP -> P NP
|
||||
NP -> 'I'
|
||||
N -> 'man' | 'park' | 'telescope' | 'dog'
|
||||
Det -> 'the' | 'a'
|
||||
P -> 'in' | 'with'
|
||||
V -> 'saw'
|
||||
"""
|
||||
)
|
||||
|
||||
for prod in grammar.productions():
|
||||
print(prod)
|
||||
|
||||
sent = "I saw a man in the park".split()
|
||||
parser = parse.RecursiveDescentParser(grammar, trace=2)
|
||||
for p in parser.parse(sent):
|
||||
print(p)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
478
backend/venv/Lib/site-packages/nltk/parse/shiftreduce.py
Normal file
@@ -0,0 +1,478 @@
|
||||
# Natural Language Toolkit: Shift-Reduce Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.grammar import Nonterminal
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Shift/Reduce Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class ShiftReduceParser(ParserI):
|
||||
"""
|
||||
A simple bottom-up CFG parser that uses two operations, "shift"
|
||||
and "reduce", to find a single parse for a text.
|
||||
|
||||
``ShiftReduceParser`` maintains a stack, which records the
|
||||
structure of a portion of the text. This stack is a list of
|
||||
strings and Trees that collectively cover a portion of
|
||||
the text. For example, while parsing the sentence "the dog saw
|
||||
the man" with a typical grammar, ``ShiftReduceParser`` will produce
|
||||
the following stack, which covers "the dog saw"::
|
||||
|
||||
[(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
|
||||
|
||||
``ShiftReduceParser`` attempts to extend the stack to cover the
|
||||
entire text, and to combine the stack elements into a single tree,
|
||||
producing a complete parse for the sentence.
|
||||
|
||||
Initially, the stack is empty. It is extended to cover the text,
|
||||
from left to right, by repeatedly applying two operations:
|
||||
|
||||
- "shift" moves a token from the beginning of the text to the
|
||||
end of the stack.
|
||||
- "reduce" uses a CFG production to combine the rightmost stack
|
||||
elements into a single Tree.
|
||||
|
||||
Often, more than one operation can be performed on a given stack.
|
||||
In this case, ``ShiftReduceParser`` uses the following heuristics
|
||||
to decide which operation to perform:
|
||||
|
||||
- Only shift if no reductions are available.
|
||||
- If multiple reductions are available, then apply the reduction
|
||||
whose CFG production is listed earliest in the grammar.
|
||||
|
||||
Note that these heuristics are not guaranteed to choose an
|
||||
operation that leads to a parse of the text. Also, if multiple
|
||||
    parses exist, ``ShiftReduceParser`` will return at most one of
|
||||
them.
|
||||
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
"""
|
||||
Create a new ``ShiftReduceParser``, that uses ``grammar`` to
|
||||
parse texts.
|
||||
|
||||
:type grammar: Grammar
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
self._check_grammar()
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
|
||||
# initialize the stack.
|
||||
stack = []
|
||||
remaining_text = tokens
|
||||
|
||||
# Trace output.
|
||||
if self._trace:
|
||||
print("Parsing %r" % " ".join(tokens))
|
||||
self._trace_stack(stack, remaining_text)
|
||||
|
||||
# iterate through the text, pushing the token onto
|
||||
# the stack, then reducing the stack.
|
||||
while len(remaining_text) > 0:
|
||||
self._shift(stack, remaining_text)
|
||||
while self._reduce(stack, remaining_text):
|
||||
pass
|
||||
|
||||
# Did we reduce everything?
|
||||
if len(stack) == 1:
|
||||
# Did we end up with the right category?
|
||||
if stack[0].label() == self._grammar.start().symbol():
|
||||
yield stack[0]
|
||||
|
||||
def _shift(self, stack, remaining_text):
|
||||
"""
|
||||
Move a token from the beginning of ``remaining_text`` to the
|
||||
end of ``stack``.
|
||||
|
||||
:type stack: list(str and Tree)
|
||||
:param stack: A list of strings and Trees, encoding
|
||||
the structure of the text that has been parsed so far.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``stack``.
|
||||
:rtype: None
|
||||
"""
|
||||
stack.append(remaining_text[0])
|
||||
remaining_text.remove(remaining_text[0])
|
||||
if self._trace:
|
||||
self._trace_shift(stack, remaining_text)
|
||||
|
||||
def _match_rhs(self, rhs, rightmost_stack):
|
||||
"""
|
||||
:rtype: bool
|
||||
:return: true if the right hand side of a CFG production
|
||||
matches the rightmost elements of the stack. ``rhs``
|
||||
matches ``rightmost_stack`` if they are the same length,
|
||||
and each element of ``rhs`` matches the corresponding
|
||||
element of ``rightmost_stack``. A nonterminal element of
|
||||
``rhs`` matches any Tree whose node value is equal
|
||||
to the nonterminal's symbol. A terminal element of ``rhs``
|
||||
matches any string whose type is equal to the terminal.
|
||||
:type rhs: list(terminal and Nonterminal)
|
||||
:param rhs: The right hand side of a CFG production.
|
||||
:type rightmost_stack: list(string and Tree)
|
||||
:param rightmost_stack: The rightmost elements of the parser's
|
||||
stack.
|
||||
"""
|
||||
|
||||
if len(rightmost_stack) != len(rhs):
|
||||
return False
|
||||
for i in range(len(rightmost_stack)):
|
||||
if isinstance(rightmost_stack[i], Tree):
|
||||
if not isinstance(rhs[i], Nonterminal):
|
||||
return False
|
||||
if rightmost_stack[i].label() != rhs[i].symbol():
|
||||
return False
|
||||
else:
|
||||
if isinstance(rhs[i], Nonterminal):
|
||||
return False
|
||||
if rightmost_stack[i] != rhs[i]:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _reduce(self, stack, remaining_text, production=None):
|
||||
"""
|
||||
Find a CFG production whose right hand side matches the
|
||||
rightmost stack elements; and combine those stack elements
|
||||
into a single Tree, with the node specified by the
|
||||
production's left-hand side. If more than one CFG production
|
||||
matches the stack, then use the production that is listed
|
||||
earliest in the grammar. The new Tree replaces the
|
||||
elements in the stack.
|
||||
|
||||
:rtype: Production or None
|
||||
:return: If a reduction is performed, then return the CFG
|
||||
production that the reduction is based on; otherwise,
|
||||
            return None.
|
||||
:type stack: list(string and Tree)
|
||||
:param stack: A list of strings and Trees, encoding
|
||||
the structure of the text that has been parsed so far.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``stack``.
|
||||
"""
|
||||
if production is None:
|
||||
productions = self._grammar.productions()
|
||||
else:
|
||||
productions = [production]
|
||||
|
||||
# Try each production, in order.
|
||||
for production in productions:
|
||||
rhslen = len(production.rhs())
|
||||
|
||||
# check if the RHS of a production matches the top of the stack
|
||||
if self._match_rhs(production.rhs(), stack[-rhslen:]):
|
||||
# combine the tree to reflect the reduction
|
||||
tree = Tree(production.lhs().symbol(), stack[-rhslen:])
|
||||
stack[-rhslen:] = [tree]
|
||||
|
||||
# We reduced something
|
||||
if self._trace:
|
||||
self._trace_reduce(stack, production, remaining_text)
|
||||
return production
|
||||
|
||||
# We didn't reduce anything
|
||||
return None
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
# 1: just show shifts.
|
||||
# 2: show shifts & reduces
|
||||
        # 3: display which tokens & productions are shifted/reduced
|
||||
self._trace = trace
|
||||
|
||||
def _trace_stack(self, stack, remaining_text, marker=" "):
|
||||
"""
|
||||
Print trace output displaying the given stack and text.
|
||||
|
||||
:rtype: None
|
||||
:param marker: A character that is printed to the left of the
|
||||
stack. This is used with trace level 2 to print 'S'
|
||||
before shifted stacks and 'R' before reduced stacks.
|
||||
"""
|
||||
s = " " + marker + " [ "
|
||||
for elt in stack:
|
||||
if isinstance(elt, Tree):
|
||||
s += repr(Nonterminal(elt.label())) + " "
|
||||
else:
|
||||
s += repr(elt) + " "
|
||||
s += "* " + " ".join(remaining_text) + "]"
|
||||
print(s)
|
||||
|
||||
def _trace_shift(self, stack, remaining_text):
|
||||
"""
|
||||
Print trace output displaying that a token has been shifted.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
if self._trace > 2:
|
||||
print("Shift %r:" % stack[-1])
|
||||
if self._trace == 2:
|
||||
self._trace_stack(stack, remaining_text, "S")
|
||||
elif self._trace > 0:
|
||||
self._trace_stack(stack, remaining_text)
|
||||
|
||||
def _trace_reduce(self, stack, production, remaining_text):
|
||||
"""
|
||||
Print trace output displaying that ``production`` was used to
|
||||
reduce ``stack``.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
if self._trace > 2:
|
||||
rhs = " ".join(production.rhs())
|
||||
print(f"Reduce {production.lhs()!r} <- {rhs}")
|
||||
if self._trace == 2:
|
||||
self._trace_stack(stack, remaining_text, "R")
|
||||
elif self._trace > 1:
|
||||
self._trace_stack(stack, remaining_text)
|
||||
|
||||
def _check_grammar(self):
|
||||
"""
|
||||
Check to make sure that all of the CFG productions are
|
||||
potentially useful. If any productions can never be used,
|
||||
then print a warning.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
productions = self._grammar.productions()
|
||||
|
||||
# Any production whose RHS is an extension of another production's RHS
|
||||
# will never be used.
|
||||
for i in range(len(productions)):
|
||||
for j in range(i + 1, len(productions)):
|
||||
rhs1 = productions[i].rhs()
|
||||
rhs2 = productions[j].rhs()
|
||||
if rhs1[: len(rhs2)] == rhs2:
|
||||
print("Warning: %r will never be used" % productions[i])
|
||||
|
||||
|
||||
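# --- Illustrative sketch (not part of the NLTK source) ---------------------
# The class docstring above warns that the shift/reduce heuristics are not
# guaranteed to find a parse.  With the hypothetical grammar below the parser
# greedily reduces "saw a man" to a VP (and then to an S) before the PP
# "in the park" has been shifted, so no parse covering the whole sentence is
# found and an empty list is returned.
def _greedy_reduce_sketch():
    from nltk import CFG

    toy_grammar = CFG.fromstring(
        """
        S -> NP VP
        VP -> V NP | V NP PP
        NP -> 'I' | Det N
        PP -> P NP
        Det -> 'a' | 'the'
        N -> 'man' | 'park'
        P -> 'in'
        V -> 'saw'
        """
    )
    parser = ShiftReduceParser(toy_grammar, trace=2)
    return list(parser.parse("I saw a man in the park".split()))  # expected: []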
##//////////////////////////////////////////////////////
|
||||
## Stepping Shift/Reduce Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class SteppingShiftReduceParser(ShiftReduceParser):
|
||||
"""
|
||||
    A ``ShiftReduceParser`` that allows you to step through the parsing
|
||||
process, performing a single operation at a time. It also allows
|
||||
you to change the parser's grammar midway through parsing a text.
|
||||
|
||||
The ``initialize`` method is used to start parsing a text.
|
||||
``shift`` performs a single shift operation, and ``reduce`` performs
|
||||
a single reduce operation. ``step`` will perform a single reduce
|
||||
operation if possible; otherwise, it will perform a single shift
|
||||
operation. ``parses`` returns the set of parses that have been
|
||||
found by the parser.
|
||||
|
||||
:ivar _history: A list of ``(stack, remaining_text)`` pairs,
|
||||
containing all of the previous states of the parser. This
|
||||
history is used to implement the ``undo`` operation.
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
super().__init__(grammar, trace)
|
||||
self._stack = None
|
||||
self._remaining_text = None
|
||||
self._history = []
|
||||
|
||||
def parse(self, tokens):
|
||||
tokens = list(tokens)
|
||||
self.initialize(tokens)
|
||||
while self.step():
|
||||
pass
|
||||
return self.parses()
|
||||
|
||||
def stack(self):
|
||||
"""
|
||||
:return: The parser's stack.
|
||||
:rtype: list(str and Tree)
|
||||
"""
|
||||
return self._stack
|
||||
|
||||
def remaining_text(self):
|
||||
"""
|
||||
:return: The portion of the text that is not yet covered by the
|
||||
stack.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
return self._remaining_text
|
||||
|
||||
def initialize(self, tokens):
|
||||
"""
|
||||
Start parsing a given text. This sets the parser's stack to
|
||||
``[]`` and sets its remaining text to ``tokens``.
|
||||
"""
|
||||
self._stack = []
|
||||
self._remaining_text = tokens
|
||||
self._history = []
|
||||
|
||||
def step(self):
|
||||
"""
|
||||
Perform a single parsing operation. If a reduction is
|
||||
possible, then perform that reduction, and return the
|
||||
production that it is based on. Otherwise, if a shift is
|
||||
possible, then perform it, and return True. Otherwise,
|
||||
return False.
|
||||
|
||||
:return: False if no operation was performed; True if a shift was
|
||||
performed; and the CFG production used to reduce if a
|
||||
reduction was performed.
|
||||
:rtype: Production or bool
|
||||
"""
|
||||
return self.reduce() or self.shift()
|
||||
|
||||
def shift(self):
|
||||
"""
|
||||
Move a token from the beginning of the remaining text to the
|
||||
end of the stack. If there are no more tokens in the
|
||||
remaining text, then do nothing.
|
||||
|
||||
:return: True if the shift operation was successful.
|
||||
:rtype: bool
|
||||
"""
|
||||
if len(self._remaining_text) == 0:
|
||||
return False
|
||||
self._history.append((self._stack[:], self._remaining_text[:]))
|
||||
self._shift(self._stack, self._remaining_text)
|
||||
return True
|
||||
|
||||
def reduce(self, production=None):
|
||||
"""
|
||||
Use ``production`` to combine the rightmost stack elements into
|
||||
a single Tree. If ``production`` does not match the
|
||||
rightmost stack elements, then do nothing.
|
||||
|
||||
:return: The production used to reduce the stack, if a
|
||||
reduction was performed. If no reduction was performed,
|
||||
return None.
|
||||
|
||||
:rtype: Production or None
|
||||
"""
|
||||
self._history.append((self._stack[:], self._remaining_text[:]))
|
||||
return_val = self._reduce(self._stack, self._remaining_text, production)
|
||||
|
||||
if not return_val:
|
||||
self._history.pop()
|
||||
return return_val
|
||||
|
||||
def undo(self):
|
||||
"""
|
||||
Return the parser to its state before the most recent
|
||||
        shift or reduce operation. Calling ``undo`` repeatedly returns
|
||||
the parser to successively earlier states. If no shift or
|
||||
reduce operations have been performed, ``undo`` will make no
|
||||
changes.
|
||||
|
||||
:return: true if an operation was successfully undone.
|
||||
:rtype: bool
|
||||
"""
|
||||
if len(self._history) == 0:
|
||||
return False
|
||||
(self._stack, self._remaining_text) = self._history.pop()
|
||||
return True
|
||||
|
||||
def reducible_productions(self):
|
||||
"""
|
||||
:return: A list of the productions for which reductions are
|
||||
available for the current parser state.
|
||||
:rtype: list(Production)
|
||||
"""
|
||||
productions = []
|
||||
for production in self._grammar.productions():
|
||||
rhslen = len(production.rhs())
|
||||
if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
|
||||
productions.append(production)
|
||||
return productions
|
||||
|
||||
def parses(self):
|
||||
"""
|
||||
:return: An iterator of the parses that have been found by this
|
||||
parser so far.
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
if (
|
||||
len(self._remaining_text) == 0
|
||||
and len(self._stack) == 1
|
||||
and self._stack[0].label() == self._grammar.start().symbol()
|
||||
):
|
||||
yield self._stack[0]
|
||||
|
||||
# copied from nltk.parser
|
||||
|
||||
def set_grammar(self, grammar):
|
||||
"""
|
||||
Change the grammar used to parse texts.
|
||||
|
||||
:param grammar: The new grammar.
|
||||
:type grammar: CFG
|
||||
"""
|
||||
self._grammar = grammar
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demonstration Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
A demonstration of the shift-reduce parser.
|
||||
"""
|
||||
|
||||
from nltk import CFG, parse
|
||||
|
||||
grammar = CFG.fromstring(
|
||||
"""
|
||||
S -> NP VP
|
||||
NP -> Det N | Det N PP
|
||||
VP -> V NP | V NP PP
|
||||
PP -> P NP
|
||||
NP -> 'I'
|
||||
N -> 'man' | 'park' | 'telescope' | 'dog'
|
||||
Det -> 'the' | 'a'
|
||||
P -> 'in' | 'with'
|
||||
V -> 'saw'
|
||||
"""
|
||||
)
|
||||
|
||||
sent = "I saw a man in the park".split()
|
||||
|
||||
parser = parse.ShiftReduceParser(grammar, trace=2)
|
||||
for p in parser.parse(sent):
|
||||
print(p)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
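# --- Illustrative sketch (not part of the NLTK source) ---------------------
# Driving the stepping shift/reduce parser above one operation at a time.
# The grammar and sentence are hypothetical; step() returns the production
# used for a reduction, True for a shift, and False when neither applies.
def _stepping_sr_sketch():
    from nltk import CFG

    toy_grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'I' | Det N
        VP -> V NP
        Det -> 'a'
        N -> 'dog'
        V -> 'saw'
        """
    )
    parser = SteppingShiftReduceParser(toy_grammar)
    parser.initialize("I saw a dog".split())
    while parser.step():
        # Show the partial analysis after every shift or reduce.
        print(parser.stack(), parser.remaining_text())
    return list(parser.parses())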
468
backend/venv/Lib/site-packages/nltk/parse/stanford.py
Normal file
@@ -0,0 +1,468 @@
|
||||
# Natural Language Toolkit: Interface to the Stanford Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Xu <xxu@student.unimelb.edu.au>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import warnings
|
||||
from subprocess import PIPE
|
||||
|
||||
from nltk.internals import (
|
||||
_java_options,
|
||||
config_java,
|
||||
find_jar_iter,
|
||||
find_jars_within_path,
|
||||
java,
|
||||
)
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
from nltk.tree import Tree
|
||||
|
||||
_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
|
||||
|
||||
|
||||
class GenericStanfordParser(ParserI):
|
||||
"""Interface to the Stanford Parser"""
|
||||
|
||||
_MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
|
||||
_JAR = r"stanford-parser\.jar"
|
||||
_MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
|
||||
|
||||
_USE_STDIN = False
|
||||
_DOUBLE_SPACED_OUTPUT = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_to_jar=None,
|
||||
path_to_models_jar=None,
|
||||
model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
|
||||
encoding="utf8",
|
||||
verbose=False,
|
||||
java_options="-mx4g",
|
||||
corenlp_options="",
|
||||
):
|
||||
# find the most recent code and model jar
|
||||
stanford_jar = max(
|
||||
find_jar_iter(
|
||||
self._JAR,
|
||||
path_to_jar,
|
||||
env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
),
|
||||
key=lambda model_path: os.path.dirname(model_path),
|
||||
)
|
||||
|
||||
model_jar = max(
|
||||
find_jar_iter(
|
||||
self._MODEL_JAR_PATTERN,
|
||||
path_to_models_jar,
|
||||
env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
),
|
||||
key=lambda model_path: os.path.dirname(model_path),
|
||||
)
|
||||
|
||||
# self._classpath = (stanford_jar, model_jar)
|
||||
|
||||
# Adding logging jar files to classpath
|
||||
stanford_dir = os.path.split(stanford_jar)[0]
|
||||
self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
|
||||
|
||||
self.model_path = model_path
|
||||
self._encoding = encoding
|
||||
self.corenlp_options = corenlp_options
|
||||
self.java_options = java_options
|
||||
|
||||
def _parse_trees_output(self, output_):
|
||||
res = []
|
||||
cur_lines = []
|
||||
cur_trees = []
|
||||
blank = False
|
||||
for line in output_.splitlines(False):
|
||||
if line == "":
|
||||
if blank:
|
||||
res.append(iter(cur_trees))
|
||||
cur_trees = []
|
||||
blank = False
|
||||
elif self._DOUBLE_SPACED_OUTPUT:
|
||||
cur_trees.append(self._make_tree("\n".join(cur_lines)))
|
||||
cur_lines = []
|
||||
blank = True
|
||||
else:
|
||||
res.append(iter([self._make_tree("\n".join(cur_lines))]))
|
||||
cur_lines = []
|
||||
else:
|
||||
cur_lines.append(line)
|
||||
blank = False
|
||||
return iter(res)
|
||||
|
||||
def parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
|
||||
list where each sentence is a list of words.
|
||||
Each sentence will be automatically tagged with this StanfordParser instance's
|
||||
tagger.
|
||||
        If whitespace exists inside a token, then the token will be treated as
|
||||
separate tokens.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(str))
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
cmd = [
|
||||
self._MAIN_CLASS,
|
||||
"-model",
|
||||
self.model_path,
|
||||
"-sentences",
|
||||
"newline",
|
||||
"-outputFormat",
|
||||
self._OUTPUT_FORMAT,
|
||||
"-tokenized",
|
||||
"-escaper",
|
||||
"edu.stanford.nlp.process.PTBEscapingProcessor",
|
||||
]
|
||||
return self._parse_trees_output(
|
||||
self._execute(
|
||||
cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
|
||||
)
|
||||
)
|
||||
|
||||
def raw_parse(self, sentence, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse a sentence. Takes a sentence as a string;
|
||||
before parsing, it will be automatically tokenized and tagged by
|
||||
the Stanford Parser.
|
||||
|
||||
:param sentence: Input sentence to parse
|
||||
:type sentence: str
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
return next(self.raw_parse_sents([sentence], verbose))
|
||||
|
||||
def raw_parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
|
||||
list of strings.
|
||||
Each sentence will be automatically tokenized and tagged by the Stanford Parser.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(str)
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
cmd = [
|
||||
self._MAIN_CLASS,
|
||||
"-model",
|
||||
self.model_path,
|
||||
"-sentences",
|
||||
"newline",
|
||||
"-outputFormat",
|
||||
self._OUTPUT_FORMAT,
|
||||
]
|
||||
return self._parse_trees_output(
|
||||
self._execute(cmd, "\n".join(sentences), verbose)
|
||||
)
|
||||
|
||||
def tagged_parse(self, sentence, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse a sentence. Takes a sentence as a list of
|
||||
(word, tag) tuples; the sentence must have already been tokenized and
|
||||
tagged.
|
||||
|
||||
:param sentence: Input sentence to parse
|
||||
:type sentence: list(tuple(str, str))
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
return next(self.tagged_parse_sents([sentence], verbose))
|
||||
|
||||
def tagged_parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse multiple sentences. Takes multiple sentences
|
||||
where each sentence is a list of (word, tag) tuples.
|
||||
The sentences must have already been tokenized and tagged.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(tuple(str, str)))
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
tag_separator = "/"
|
||||
cmd = [
|
||||
self._MAIN_CLASS,
|
||||
"-model",
|
||||
self.model_path,
|
||||
"-sentences",
|
||||
"newline",
|
||||
"-outputFormat",
|
||||
self._OUTPUT_FORMAT,
|
||||
"-tokenized",
|
||||
"-tagSeparator",
|
||||
tag_separator,
|
||||
"-tokenizerFactory",
|
||||
"edu.stanford.nlp.process.WhitespaceTokenizer",
|
||||
"-tokenizerMethod",
|
||||
"newCoreLabelTokenizerFactory",
|
||||
]
|
||||
# We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
|
||||
return self._parse_trees_output(
|
||||
self._execute(
|
||||
cmd,
|
||||
"\n".join(
|
||||
" ".join(tag_separator.join(tagged) for tagged in sentence)
|
||||
for sentence in sentences
|
||||
),
|
||||
verbose,
|
||||
)
|
||||
)
|
||||
|
||||
def _execute(self, cmd, input_, verbose=False):
|
||||
encoding = self._encoding
|
||||
cmd.extend(["-encoding", encoding])
|
||||
if self.corenlp_options:
|
||||
cmd.extend(self.corenlp_options.split())
|
||||
|
||||
default_options = " ".join(_java_options)
|
||||
|
||||
# Configure java.
|
||||
config_java(options=self.java_options, verbose=verbose)
|
||||
|
||||
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
|
||||
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
|
||||
# Write the actual sentences to the temporary input file
|
||||
if isinstance(input_, str) and encoding:
|
||||
input_ = input_.encode(encoding)
|
||||
input_file.write(input_)
|
||||
input_file.flush()
|
||||
|
||||
# Run the tagger and get the output.
|
||||
if self._USE_STDIN:
|
||||
input_file.seek(0)
|
||||
stdout, stderr = java(
|
||||
cmd,
|
||||
classpath=self._classpath,
|
||||
stdin=input_file,
|
||||
stdout=PIPE,
|
||||
stderr=PIPE,
|
||||
)
|
||||
else:
|
||||
cmd.append(input_file.name)
|
||||
stdout, stderr = java(
|
||||
cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
|
||||
)
|
||||
|
||||
stdout = stdout.replace(b"\xc2\xa0", b" ")
|
||||
stdout = stdout.replace(b"\x00\xa0", b" ")
|
||||
stdout = stdout.decode(encoding)
|
||||
|
||||
os.unlink(input_file.name)
|
||||
|
||||
# Return java configurations to their default values.
|
||||
config_java(options=default_options, verbose=False)
|
||||
|
||||
return stdout
|
||||
|
||||
|
||||
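# --- Illustrative sketch (not part of the NLTK source) ---------------------
# The classes below all emit DeprecationWarnings pointing at
# nltk.parse.corenlp.  Assuming a CoreNLP server is already running on
# localhost:9000, the suggested replacement looks roughly like this:
def _corenlp_replacement_sketch():
    from nltk.parse.corenlp import CoreNLPParser

    parser = CoreNLPParser(url="http://localhost:9000")
    return next(parser.raw_parse("the quick brown fox jumps over the lazy dog"))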
class StanfordParser(GenericStanfordParser):
|
||||
"""
|
||||
>>> parser=StanfordParser(
|
||||
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
|
||||
... ) # doctest: +SKIP
|
||||
|
||||
>>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
|
||||
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
|
||||
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
|
||||
|
||||
>>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
|
||||
... "the quick brown fox jumps over the lazy dog",
|
||||
... "the quick grey wolf jumps over the lazy fox"
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
|
||||
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
|
||||
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
|
||||
[Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
|
||||
[Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
|
||||
Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
|
||||
|
||||
>>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
|
||||
Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
|
||||
[Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
|
||||
Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
|
||||
Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]
|
||||
|
||||
>>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
|
||||
... (
|
||||
... ("The", "DT"),
|
||||
... ("quick", "JJ"),
|
||||
... ("brown", "JJ"),
|
||||
... ("fox", "NN"),
|
||||
... ("jumped", "VBD"),
|
||||
... ("over", "IN"),
|
||||
... ("the", "DT"),
|
||||
... ("lazy", "JJ"),
|
||||
... ("dog", "NN"),
|
||||
... (".", "."),
|
||||
... ),
|
||||
... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
|
||||
Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
|
||||
[Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "penn"
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn(
|
||||
"The StanfordParser will be deprecated\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _make_tree(self, result):
|
||||
return Tree.fromstring(result)
|
||||
|
||||
|
||||
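# --- Illustrative sketch (not part of the NLTK source) ---------------------
# The tagged_parse* methods above expect sentences that are already tokenized
# and tagged.  Assuming the parser and model jars can be located through the
# STANFORD_PARSER / STANFORD_MODELS environment variables, a call might look
# like this (it will emit the DeprecationWarning defined above):
def _tagged_parse_sketch():
    tagged_sentence = [("The", "DT"), ("dog", "NN"), ("barks", "VBZ")]
    parser = StanfordParser()
    return next(parser.tagged_parse(tagged_sentence))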
class StanfordDependencyParser(GenericStanfordParser):
|
||||
"""
|
||||
>>> dep_parser=StanfordDependencyParser(
|
||||
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
|
||||
... ) # doctest: +SKIP
|
||||
|
||||
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
|
||||
|
||||
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
|
||||
((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
|
||||
((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
|
||||
((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
|
||||
... "The quick brown fox jumps over the lazy dog.",
|
||||
... "The quick grey wolf jumps over the lazy fox."
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
|
||||
Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
|
||||
|
||||
>>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
|
||||
... (
|
||||
... ("The", "DT"),
|
||||
... ("quick", "JJ"),
|
||||
... ("brown", "JJ"),
|
||||
... ("fox", "NN"),
|
||||
... ("jumped", "VBD"),
|
||||
... ("over", "IN"),
|
||||
... ("the", "DT"),
|
||||
... ("lazy", "JJ"),
|
||||
... ("dog", "NN"),
|
||||
... (".", "."),
|
||||
... ),
|
||||
... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
|
||||
((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
|
||||
((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
|
||||
((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
|
||||
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "conll2007"
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn(
|
||||
"The StanfordDependencyParser will be deprecated\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _make_tree(self, result):
|
||||
return DependencyGraph(result, top_relation_label="root")
|
||||
|
||||
|
||||
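# --- Illustrative sketch (not part of the NLTK source) ---------------------
# Replacement for the deprecated dependency parsers in this module, assuming
# a CoreNLP server is already running on localhost:9000:
def _corenlp_dependency_sketch():
    from nltk.parse.corenlp import CoreNLPDependencyParser

    dep_parser = CoreNLPDependencyParser(url="http://localhost:9000")
    (parse,) = dep_parser.raw_parse("The quick brown fox jumps over the lazy dog .")
    return list(parse.triples())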
class StanfordNeuralDependencyParser(GenericStanfordParser):
|
||||
"""
|
||||
>>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
|
||||
>>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP
|
||||
|
||||
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
|
||||
|
||||
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
|
||||
(u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
|
||||
u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
|
||||
((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
|
||||
(u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
|
||||
u'punct', (u'.', u'.'))]]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
|
||||
... "The quick brown fox jumps over the lazy dog.",
|
||||
... "The quick grey wolf jumps over the lazy fox."
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
|
||||
'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
|
||||
Tree('fox', ['over', 'the', 'lazy']), '.'])]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
|
||||
['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "conll"
|
||||
_MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
|
||||
_JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
|
||||
_MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
|
||||
_USE_STDIN = True
|
||||
_DOUBLE_SPACED_OUTPUT = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn(
|
||||
"The StanfordNeuralDependencyParser will be deprecated\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
|
||||
|
||||
def tagged_parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Currently unimplemented because the neural dependency parser (and
|
||||
the StanfordCoreNLP pipeline class) doesn't support passing in pre-
|
||||
tagged tokens.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"tagged_parse[_sents] is not supported by "
|
||||
"StanfordNeuralDependencyParser; use "
|
||||
"parse[_sents] or raw_parse[_sents] instead."
|
||||
)
|
||||
|
||||
def _make_tree(self, result):
|
||||
return DependencyGraph(result, top_relation_label="ROOT")
|
||||
793
backend/venv/Lib/site-packages/nltk/parse/transitionparser.py
Normal file
@@ -0,0 +1,793 @@
|
||||
# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
|
||||
#
|
||||
# Author: Long Duong <longdt219@gmail.com>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import pickle
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from operator import itemgetter
|
||||
from os import remove
|
||||
|
||||
try:
|
||||
from numpy import array
|
||||
from scipy import sparse
|
||||
from sklearn import svm
|
||||
from sklearn.datasets import load_svmlight_file
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI
|
||||
|
||||
|
||||
class Configuration:
|
||||
"""
|
||||
    Class for holding a configuration, which is a partial analysis of the input sentence.
|
||||
    The transition-based parser aims at finding a set of operators that transfers the initial
|
||||
configuration to the terminal configuration.
|
||||
|
||||
The configuration includes:
|
||||
        - Stack: for storing partially processed words
|
||||
- Buffer: for storing remaining input words
|
||||
- Set of arcs: for storing partially built dependency tree
|
||||
|
||||
    This class also provides a method to represent a configuration as a list of features.
|
||||
"""
|
||||
|
||||
def __init__(self, dep_graph):
|
||||
"""
|
||||
:param dep_graph: the representation of an input in the form of dependency graph.
|
||||
:type dep_graph: DependencyGraph where the dependencies are not specified.
|
||||
"""
|
||||
# dep_graph.nodes contain list of token for a sentence
|
||||
self.stack = [0] # The root element
|
||||
self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer
|
||||
self.arcs = [] # empty set of arc
|
||||
self._tokens = dep_graph.nodes
|
||||
self._max_address = len(self.buffer)
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
"Stack : "
|
||||
+ str(self.stack)
|
||||
+ " Buffer : "
|
||||
+ str(self.buffer)
|
||||
+ " Arcs : "
|
||||
+ str(self.arcs)
|
||||
)
|
||||
|
||||
def _check_informative(self, feat, flag=False):
|
||||
"""
|
||||
Check whether a feature is informative
|
||||
        The flag controls whether "_" is informative or not
|
||||
"""
|
||||
if feat is None:
|
||||
return False
|
||||
if feat == "":
|
||||
return False
|
||||
if flag is False:
|
||||
if feat == "_":
|
||||
return False
|
||||
return True
|
||||
|
||||
def extract_features(self):
|
||||
"""
|
||||
        Extract the set of features for the current configuration. Implement standard features as described in
|
||||
        Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonald, Joakim Nivre.
|
||||
Please note that these features are very basic.
|
||||
:return: list(str)
|
||||
"""
|
||||
result = []
|
||||
# Todo : can come up with more complicated features set for better
|
||||
# performance.
|
||||
if len(self.stack) > 0:
|
||||
# Stack 0
|
||||
stack_idx0 = self.stack[len(self.stack) - 1]
|
||||
token = self._tokens[stack_idx0]
|
||||
if self._check_informative(token["word"], True):
|
||||
result.append("STK_0_FORM_" + token["word"])
|
||||
if "lemma" in token and self._check_informative(token["lemma"]):
|
||||
result.append("STK_0_LEMMA_" + token["lemma"])
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("STK_0_POS_" + token["tag"])
|
||||
if "feats" in token and self._check_informative(token["feats"]):
|
||||
feats = token["feats"].split("|")
|
||||
for feat in feats:
|
||||
result.append("STK_0_FEATS_" + feat)
|
||||
# Stack 1
|
||||
if len(self.stack) > 1:
|
||||
stack_idx1 = self.stack[len(self.stack) - 2]
|
||||
token = self._tokens[stack_idx1]
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("STK_1_POS_" + token["tag"])
|
||||
|
||||
# Left most, right most dependency of stack[0]
|
||||
left_most = 1000000
|
||||
right_most = -1
|
||||
dep_left_most = ""
|
||||
dep_right_most = ""
|
||||
for wi, r, wj in self.arcs:
|
||||
if wi == stack_idx0:
|
||||
if (wj > wi) and (wj > right_most):
|
||||
right_most = wj
|
||||
dep_right_most = r
|
||||
if (wj < wi) and (wj < left_most):
|
||||
left_most = wj
|
||||
dep_left_most = r
|
||||
if self._check_informative(dep_left_most):
|
||||
result.append("STK_0_LDEP_" + dep_left_most)
|
||||
if self._check_informative(dep_right_most):
|
||||
result.append("STK_0_RDEP_" + dep_right_most)
|
||||
|
||||
# Check Buffered 0
|
||||
if len(self.buffer) > 0:
|
||||
# Buffer 0
|
||||
buffer_idx0 = self.buffer[0]
|
||||
token = self._tokens[buffer_idx0]
|
||||
if self._check_informative(token["word"], True):
|
||||
result.append("BUF_0_FORM_" + token["word"])
|
||||
if "lemma" in token and self._check_informative(token["lemma"]):
|
||||
result.append("BUF_0_LEMMA_" + token["lemma"])
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_0_POS_" + token["tag"])
|
||||
if "feats" in token and self._check_informative(token["feats"]):
|
||||
feats = token["feats"].split("|")
|
||||
for feat in feats:
|
||||
result.append("BUF_0_FEATS_" + feat)
|
||||
# Buffer 1
|
||||
if len(self.buffer) > 1:
|
||||
buffer_idx1 = self.buffer[1]
|
||||
token = self._tokens[buffer_idx1]
|
||||
if self._check_informative(token["word"], True):
|
||||
result.append("BUF_1_FORM_" + token["word"])
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_1_POS_" + token["tag"])
|
||||
if len(self.buffer) > 2:
|
||||
buffer_idx2 = self.buffer[2]
|
||||
token = self._tokens[buffer_idx2]
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_2_POS_" + token["tag"])
|
||||
if len(self.buffer) > 3:
|
||||
buffer_idx3 = self.buffer[3]
|
||||
token = self._tokens[buffer_idx3]
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_3_POS_" + token["tag"])
|
||||
# Left most, right most dependency of stack[0]
|
||||
left_most = 1000000
|
||||
right_most = -1
|
||||
dep_left_most = ""
|
||||
dep_right_most = ""
|
||||
for wi, r, wj in self.arcs:
|
||||
if wi == buffer_idx0:
|
||||
if (wj > wi) and (wj > right_most):
|
||||
right_most = wj
|
||||
dep_right_most = r
|
||||
if (wj < wi) and (wj < left_most):
|
||||
left_most = wj
|
||||
dep_left_most = r
|
||||
if self._check_informative(dep_left_most):
|
||||
result.append("BUF_0_LDEP_" + dep_left_most)
|
||||
if self._check_informative(dep_right_most):
|
||||
result.append("BUF_0_RDEP_" + dep_right_most)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
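# --- Illustrative sketch (not part of the NLTK source) ---------------------
# Building a Configuration from a small, hypothetical dependency graph and
# inspecting the features described in extract_features() above.  The input
# uses the 4-column (word, tag, head, rel) format accepted by DependencyGraph.
def _configuration_sketch():
    from nltk.parse.dependencygraph import DependencyGraph

    graph = DependencyGraph(
        "the DT 2 det\n"
        "dog NN 3 nsubj\n"
        "barks VBZ 0 ROOT\n"
    )
    conf = Configuration(graph)
    print(conf)                     # Stack : [0] Buffer : [1, 2, 3] Arcs : []
    print(conf.extract_features())  # word/POS features for the stack top and buffer front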
class Transition:
|
||||
"""
|
||||
    This class defines a set of transitions which are applied to a configuration to derive another configuration.
|
||||
    Note that the transitions differ between parsing algorithms.
|
||||
"""
|
||||
|
||||
# Define set of transitions
|
||||
LEFT_ARC = "LEFTARC"
|
||||
RIGHT_ARC = "RIGHTARC"
|
||||
SHIFT = "SHIFT"
|
||||
REDUCE = "REDUCE"
|
||||
|
||||
def __init__(self, alg_option):
|
||||
"""
|
||||
        :param alg_option: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
|
||||
:type alg_option: str
|
||||
"""
|
||||
self._algo = alg_option
|
||||
if alg_option not in [
|
||||
TransitionParser.ARC_STANDARD,
|
||||
TransitionParser.ARC_EAGER,
|
||||
]:
|
||||
raise ValueError(
|
||||
" Currently we only support %s and %s "
|
||||
% (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
|
||||
)
|
||||
|
||||
def left_arc(self, conf, relation):
|
||||
"""
|
||||
        Note that the left-arc algorithm is quite similar for both arc-standard and arc-eager, except for its precondition
|
||||
|
||||
        :param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
|
||||
return -1
|
||||
if conf.buffer[0] == 0:
|
||||
# here is the Root element
|
||||
return -1
|
||||
|
||||
idx_wi = conf.stack[len(conf.stack) - 1]
|
||||
|
||||
flag = True
|
||||
if self._algo == TransitionParser.ARC_EAGER:
|
||||
for idx_parent, r, idx_child in conf.arcs:
|
||||
if idx_child == idx_wi:
|
||||
flag = False
|
||||
|
||||
if flag:
|
||||
conf.stack.pop()
|
||||
idx_wj = conf.buffer[0]
|
||||
conf.arcs.append((idx_wj, relation, idx_wi))
|
||||
else:
|
||||
return -1
|
||||
|
||||
def right_arc(self, conf, relation):
|
||||
"""
|
||||
Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager
|
||||
|
||||
        :param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
|
||||
return -1
|
||||
if self._algo == TransitionParser.ARC_STANDARD:
|
||||
idx_wi = conf.stack.pop()
|
||||
idx_wj = conf.buffer[0]
|
||||
conf.buffer[0] = idx_wi
|
||||
conf.arcs.append((idx_wi, relation, idx_wj))
|
||||
else: # arc-eager
|
||||
idx_wi = conf.stack[len(conf.stack) - 1]
|
||||
idx_wj = conf.buffer.pop(0)
|
||||
conf.stack.append(idx_wj)
|
||||
conf.arcs.append((idx_wi, relation, idx_wj))
|
||||
|
||||
def reduce(self, conf):
|
||||
"""
|
||||
Note that the algorithm for reduce is only available for arc-eager
|
||||
|
||||
        :param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
|
||||
if self._algo != TransitionParser.ARC_EAGER:
|
||||
return -1
|
||||
if len(conf.stack) <= 0:
|
||||
return -1
|
||||
|
||||
idx_wi = conf.stack[len(conf.stack) - 1]
|
||||
flag = False
|
||||
for idx_parent, r, idx_child in conf.arcs:
|
||||
if idx_child == idx_wi:
|
||||
flag = True
|
||||
if flag:
|
||||
conf.stack.pop() # reduce it
|
||||
else:
|
||||
return -1
|
||||
|
||||
def shift(self, conf):
|
||||
"""
|
||||
Note that the algorithm for shift is the SAME for arc-standard and arc-eager
|
||||
|
||||
        :param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
if len(conf.buffer) <= 0:
|
||||
return -1
|
||||
idx_wi = conf.buffer.pop(0)
|
||||
conf.stack.append(idx_wi)
|
||||
|
||||
|
||||
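# --- Illustrative sketch (not part of the NLTK source) ---------------------
# Applying individual transitions by hand to a hypothetical configuration.
# The literal "arc-eager" is the same string exposed as
# TransitionParser.ARC_EAGER below.
def _transition_sketch():
    from nltk.parse.dependencygraph import DependencyGraph

    graph = DependencyGraph(
        "the DT 2 det\n"
        "dog NN 3 nsubj\n"
        "barks VBZ 0 ROOT\n"
    )
    conf = Configuration(graph)
    op = Transition("arc-eager")
    op.shift(conf)            # stack [0, 1], buffer [2, 3]
    op.left_arc(conf, "det")  # pops 1 from the stack and records the arc (2, 'det', 1)
    print(conf.arcs)          # [(2, 'det', 1)]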
class TransitionParser(ParserI):
|
||||
"""
|
||||
    Class for a transition-based parser. Implements two algorithms: "arc-standard" and "arc-eager".
|
||||
"""
|
||||
|
||||
ARC_STANDARD = "arc-standard"
|
||||
ARC_EAGER = "arc-eager"
|
||||
|
||||
def __init__(self, algorithm):
|
||||
"""
|
||||
        :param algorithm: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
|
||||
:type algorithm: str
|
||||
"""
|
||||
if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
|
||||
raise ValueError(
|
||||
" Currently we only support %s and %s "
|
||||
% (self.ARC_STANDARD, self.ARC_EAGER)
|
||||
)
|
||||
self._algorithm = algorithm
|
||||
|
||||
self._dictionary = {}
|
||||
self._transition = {}
|
||||
self._match_transition = {}
|
||||
|
||||
def _get_dep_relation(self, idx_parent, idx_child, depgraph):
|
||||
p_node = depgraph.nodes[idx_parent]
|
||||
c_node = depgraph.nodes[idx_child]
|
||||
|
||||
if c_node["word"] is None:
|
||||
return None # Root word
|
||||
|
||||
if c_node["head"] == p_node["address"]:
|
||||
return c_node["rel"]
|
||||
else:
|
||||
return None
|
||||
|
||||
def _convert_to_binary_features(self, features):
|
||||
"""
|
||||
:param features: list of feature string which is needed to convert to binary features
|
||||
:type features: list(str)
|
||||
        :return: string of binary features in libsvm format, i.e. 'featureID:value' pairs
|
||||
"""
|
||||
unsorted_result = []
|
||||
for feature in features:
|
||||
self._dictionary.setdefault(feature, len(self._dictionary))
|
||||
unsorted_result.append(self._dictionary[feature])
|
||||
|
||||
# Default value of each feature is 1.0
|
||||
return " ".join(
|
||||
str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
|
||||
)
|
||||
|
||||
def _is_projective(self, depgraph):
|
||||
arc_list = []
|
||||
for key in depgraph.nodes:
|
||||
node = depgraph.nodes[key]
|
||||
|
||||
if "head" in node:
|
||||
childIdx = node["address"]
|
||||
parentIdx = node["head"]
|
||||
if parentIdx is not None:
|
||||
arc_list.append((parentIdx, childIdx))
|
||||
|
||||
for parentIdx, childIdx in arc_list:
|
||||
# Ensure that childIdx < parentIdx
|
||||
if childIdx > parentIdx:
|
||||
temp = childIdx
|
||||
childIdx = parentIdx
|
||||
parentIdx = temp
|
||||
for k in range(childIdx + 1, parentIdx):
|
||||
for m in range(len(depgraph.nodes)):
|
||||
if (m < childIdx) or (m > parentIdx):
|
||||
if (k, m) in arc_list:
|
||||
return False
|
||||
if (m, k) in arc_list:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _write_to_file(self, key, binary_features, input_file):
|
||||
"""
|
||||
        Write the binary features to the input file and update the transition dictionary.
|
||||
"""
|
||||
self._transition.setdefault(key, len(self._transition) + 1)
|
||||
self._match_transition[self._transition[key]] = key
|
||||
|
||||
input_str = str(self._transition[key]) + " " + binary_features + "\n"
|
||||
input_file.write(input_str.encode("utf-8"))
|
||||
|
||||
def _create_training_examples_arc_std(self, depgraphs, input_file):
|
||||
"""
|
||||
Create the training example in the libsvm format and write it to the input_file.
|
||||
        Reference : Page 32, Chapter 3. Dependency Parsing by Sandra Kubler, Ryan McDonald and Joakim Nivre (2009)
|
||||
"""
|
||||
operation = Transition(self.ARC_STANDARD)
|
||||
count_proj = 0
|
||||
training_seq = []
|
||||
|
||||
for depgraph in depgraphs:
|
||||
if not self._is_projective(depgraph):
|
||||
continue
|
||||
|
||||
count_proj += 1
|
||||
conf = Configuration(depgraph)
|
||||
while len(conf.buffer) > 0:
|
||||
b0 = conf.buffer[0]
|
||||
features = conf.extract_features()
|
||||
binary_features = self._convert_to_binary_features(features)
|
||||
|
||||
if len(conf.stack) > 0:
|
||||
s0 = conf.stack[len(conf.stack) - 1]
|
||||
# Left-arc operation
|
||||
rel = self._get_dep_relation(b0, s0, depgraph)
|
||||
if rel is not None:
|
||||
key = Transition.LEFT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.left_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Right-arc operation
|
||||
rel = self._get_dep_relation(s0, b0, depgraph)
|
||||
if rel is not None:
|
||||
precondition = True
|
||||
# Get the max-index of buffer
|
||||
maxID = conf._max_address
|
||||
|
||||
for w in range(maxID + 1):
|
||||
if w != b0:
|
||||
relw = self._get_dep_relation(b0, w, depgraph)
|
||||
if relw is not None:
|
||||
if (b0, relw, w) not in conf.arcs:
|
||||
precondition = False
|
||||
|
||||
if precondition:
|
||||
key = Transition.RIGHT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.right_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Shift operation as the default
|
||||
key = Transition.SHIFT
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.shift(conf)
|
||||
training_seq.append(key)
|
||||
|
||||
print(" Number of training examples : " + str(len(depgraphs)))
|
||||
print(" Number of valid (projective) examples : " + str(count_proj))
|
||||
return training_seq
|
||||
|
||||
def _create_training_examples_arc_eager(self, depgraphs, input_file):
|
||||
"""
|
||||
Create the training example in the libsvm format and write it to the input_file.
|
||||
        Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Yoav Goldberg and Joakim Nivre
|
||||
"""
|
||||
operation = Transition(self.ARC_EAGER)
|
||||
countProj = 0
|
||||
training_seq = []
|
||||
|
||||
for depgraph in depgraphs:
|
||||
if not self._is_projective(depgraph):
|
||||
continue
|
||||
|
||||
countProj += 1
|
||||
conf = Configuration(depgraph)
|
||||
while len(conf.buffer) > 0:
|
||||
b0 = conf.buffer[0]
|
||||
features = conf.extract_features()
|
||||
binary_features = self._convert_to_binary_features(features)
|
||||
|
||||
if len(conf.stack) > 0:
|
||||
s0 = conf.stack[len(conf.stack) - 1]
|
||||
# Left-arc operation
|
||||
rel = self._get_dep_relation(b0, s0, depgraph)
|
||||
if rel is not None:
|
||||
key = Transition.LEFT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.left_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Right-arc operation
|
||||
rel = self._get_dep_relation(s0, b0, depgraph)
|
||||
if rel is not None:
|
||||
key = Transition.RIGHT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.right_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# reduce operation
|
||||
flag = False
|
||||
for k in range(s0):
|
||||
if self._get_dep_relation(k, b0, depgraph) is not None:
|
||||
flag = True
|
||||
if self._get_dep_relation(b0, k, depgraph) is not None:
|
||||
flag = True
|
||||
if flag:
|
||||
key = Transition.REDUCE
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.reduce(conf)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Shift operation as the default
|
||||
key = Transition.SHIFT
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.shift(conf)
|
||||
training_seq.append(key)
|
||||
|
||||
print(" Number of training examples : " + str(len(depgraphs)))
|
||||
print(" Number of valid (projective) examples : " + str(countProj))
|
||||
return training_seq
|
||||
|
||||
def train(self, depgraphs, modelfile, verbose=True):
|
||||
"""
|
||||
:param depgraphs : list of DependencyGraph as the training data
|
||||
:type depgraphs : list(DependencyGraph)
|
||||
:param modelfile : file name to save the trained model
|
||||
:type modelfile : str
|
||||
"""
|
||||
|
||||
try:
|
||||
input_file = tempfile.NamedTemporaryFile(
|
||||
prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
|
||||
)
|
||||
|
||||
if self._algorithm == self.ARC_STANDARD:
|
||||
self._create_training_examples_arc_std(depgraphs, input_file)
|
||||
else:
|
||||
self._create_training_examples_arc_eager(depgraphs, input_file)
|
||||
|
||||
input_file.close()
|
||||
# Using the temporary file to train the libsvm classifier
|
||||
x_train, y_train = load_svmlight_file(input_file.name)
|
||||
# The parameter is set according to the paper:
|
||||
# Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
|
||||
# TODO: probability=True makes training very slow because it triggers
# internal cross-validation; the speed here needs to be improved
|
||||
model = svm.SVC(
|
||||
kernel="poly",
|
||||
degree=2,
|
||||
coef0=0,
|
||||
gamma=0.2,
|
||||
C=0.5,
|
||||
verbose=verbose,
|
||||
probability=True,
|
||||
)
|
||||
|
||||
model.fit(x_train, y_train)
|
||||
# Save the model to file name (as pickle)
|
||||
pickle.dump(model, open(modelfile, "wb"))
|
||||
finally:
|
||||
remove(input_file.name)
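# Note (added comment): the pickled SVM saved above is only half of the state
# needed at parse time; self._dictionary and self._match_transition, built
# while the training examples were written, are what parse() below uses to map
# features and predicted class labels back to transitions.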
|
||||
|
||||
def parse(self, depgraphs, modelFile):
|
||||
"""
|
||||
:param depgraphs: the list of test sentences, each represented as a dependency graph in which the 'head' information is a dummy value
|
||||
:type depgraphs: list(DependencyGraph)
|
||||
:param modelFile: the model file
|
||||
:type modelFile: str
|
||||
:return: list (DependencyGraph) with the 'head' and 'rel' information
|
||||
"""
|
||||
result = []
|
||||
# First load the model
|
||||
model = pickle.load(open(modelFile, "rb"))
|
||||
operation = Transition(self._algorithm)
|
||||
|
||||
for depgraph in depgraphs:
|
||||
conf = Configuration(depgraph)
|
||||
while len(conf.buffer) > 0:
|
||||
features = conf.extract_features()
|
||||
col = []
|
||||
row = []
|
||||
data = []
|
||||
for feature in features:
|
||||
if feature in self._dictionary:
|
||||
col.append(self._dictionary[feature])
|
||||
row.append(0)
|
||||
data.append(1.0)
|
||||
np_col = array(sorted(col)) # NB : index must be sorted
|
||||
np_row = array(row)
|
||||
np_data = array(data)
|
||||
|
||||
x_test = sparse.csr_matrix(
|
||||
(np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
|
||||
)
|
||||
|
||||
# It would be better to use the decision function as follows, BUT that is not yet supported for sparse SVMs
|
||||
# Using decision function to build the votes array
|
||||
# dec_func = model.decision_function(x_test)[0]
|
||||
# votes = {}
|
||||
# k = 0
|
||||
# for i in range(len(model.classes_)):
|
||||
# for j in range(i+1, len(model.classes_)):
|
||||
# #if dec_func[k] > 0:
|
||||
# votes.setdefault(i,0)
|
||||
# votes[i] +=1
|
||||
# else:
|
||||
# votes.setdefault(j,0)
|
||||
# votes[j] +=1
|
||||
# k +=1
|
||||
# Sort votes according to the values
|
||||
# sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
|
||||
|
||||
# We will use predict_proba instead of decision_function
|
||||
prob_dict = {}
|
||||
pred_prob = model.predict_proba(x_test)[0]
|
||||
for i in range(len(pred_prob)):
|
||||
prob_dict[i] = pred_prob[i]
|
||||
sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)
|
||||
|
||||
# Note that SHIFT is always a valid operation
|
||||
for y_pred_idx, confidence in sorted_Prob:
|
||||
# y_pred = model.predict(x_test)[0]
|
||||
# From the prediction match to the operation
|
||||
y_pred = model.classes_[y_pred_idx]
|
||||
|
||||
if y_pred in self._match_transition:
|
||||
strTransition = self._match_transition[y_pred]
|
||||
baseTransition = strTransition.split(":")[0]
|
||||
|
||||
if baseTransition == Transition.LEFT_ARC:
|
||||
if (
|
||||
operation.left_arc(conf, strTransition.split(":")[1])
|
||||
!= -1
|
||||
):
|
||||
break
|
||||
elif baseTransition == Transition.RIGHT_ARC:
|
||||
if (
|
||||
operation.right_arc(conf, strTransition.split(":")[1])
|
||||
!= -1
|
||||
):
|
||||
break
|
||||
elif baseTransition == Transition.REDUCE:
|
||||
if operation.reduce(conf) != -1:
|
||||
break
|
||||
elif baseTransition == Transition.SHIFT:
|
||||
if operation.shift(conf) != -1:
|
||||
break
|
||||
else:
|
||||
raise ValueError(
|
||||
"The predicted transition is not recognized, expected errors"
|
||||
)
|
||||
|
||||
# Once the transitions are finished, build the dependency graph from conf.arcs
|
||||
|
||||
new_depgraph = deepcopy(depgraph)
|
||||
for key in new_depgraph.nodes:
|
||||
node = new_depgraph.nodes[key]
|
||||
node["rel"] = ""
|
||||
# By default, every token depends on the root
|
||||
node["head"] = 0
|
||||
for head, rel, child in conf.arcs:
|
||||
c_node = new_depgraph.nodes[child]
|
||||
c_node["head"] = head
|
||||
c_node["rel"] = rel
|
||||
result.append(new_depgraph)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
>>> from nltk.parse import DependencyGraph, DependencyEvaluator
|
||||
>>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
|
||||
>>> gold_sent = DependencyGraph(\"""
|
||||
... Economic JJ 2 ATT
|
||||
... news NN 3 SBJ
|
||||
... has VBD 0 ROOT
|
||||
... little JJ 5 ATT
|
||||
... effect NN 3 OBJ
|
||||
... on IN 5 ATT
|
||||
... financial JJ 8 ATT
|
||||
... markets NNS 6 PC
|
||||
... . . 3 PU
|
||||
... \""")
|
||||
|
||||
>>> conf = Configuration(gold_sent)
|
||||
|
||||
###################### Check the Initial Feature ########################
|
||||
|
||||
>>> print(', '.join(conf.extract_features()))
|
||||
STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ
|
||||
|
||||
###################### Check The Transition #######################
|
||||
Check the Initialized Configuration
|
||||
>>> print(conf)
|
||||
Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : []
|
||||
|
||||
A. Do some transition checks for ARC-STANDARD
|
||||
|
||||
>>> operation = Transition('arc-standard')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf, "ATT")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,"SBJ")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf, "ATT")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf, "ATT")
|
||||
|
||||
Middle Configuration and Features Check
|
||||
>>> print(conf)
|
||||
Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]
|
||||
|
||||
>>> print(', '.join(conf.extract_features()))
|
||||
STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT
|
||||
|
||||
>>> operation.right_arc(conf, "PC")
|
||||
>>> operation.right_arc(conf, "ATT")
|
||||
>>> operation.right_arc(conf, "OBJ")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.right_arc(conf, "PU")
|
||||
>>> operation.right_arc(conf, "ROOT")
|
||||
>>> operation.shift(conf)
|
||||
|
||||
Terminated Configuration Check
|
||||
>>> print(conf)
|
||||
Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]
|
||||
|
||||
|
||||
B. Do some transition checks for ARC-EAGER
|
||||
|
||||
>>> conf = Configuration(gold_sent)
|
||||
>>> operation = Transition('arc-eager')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'ATT')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'SBJ')
|
||||
>>> operation.right_arc(conf,'ROOT')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'ATT')
|
||||
>>> operation.right_arc(conf,'OBJ')
|
||||
>>> operation.right_arc(conf,'ATT')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'ATT')
|
||||
>>> operation.right_arc(conf,'PC')
|
||||
>>> operation.reduce(conf)
|
||||
>>> operation.reduce(conf)
|
||||
>>> operation.reduce(conf)
|
||||
>>> operation.right_arc(conf,'PU')
|
||||
>>> print(conf)
|
||||
Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]
|
||||
|
||||
###################### Check The Training Function #######################
|
||||
|
||||
A. Check the ARC-STANDARD training
|
||||
>>> import tempfile
|
||||
>>> import os
|
||||
>>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)
|
||||
|
||||
>>> parser_std = TransitionParser('arc-standard')
|
||||
>>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT
|
||||
|
||||
>>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
>>> input_file.close()
|
||||
>>> remove(input_file.name)
|
||||
|
||||
B. Check the ARC-EAGER training
|
||||
|
||||
>>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
|
||||
>>> parser_eager = TransitionParser('arc-eager')
|
||||
>>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU
|
||||
|
||||
>>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
|
||||
>>> input_file.close()
|
||||
>>> remove(input_file.name)
|
||||
|
||||
###################### Check The Parsing Function ########################
|
||||
|
||||
A. Check the ARC-STANDARD parser
|
||||
|
||||
>>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
|
||||
>>> de = DependencyEvaluator(result, [gold_sent])
|
||||
>>> de.eval() >= (0, 0)
|
||||
True
|
||||
|
||||
B. Check the ARC-EAGER parser
|
||||
>>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
|
||||
>>> de = DependencyEvaluator(result, [gold_sent])
|
||||
>>> de.eval() >= (0, 0)
|
||||
True
|
||||
|
||||
Remove test temporary files
|
||||
>>> remove('temp.arceager.model')
|
||||
>>> remove('temp.arcstd.model')
|
||||
|
||||
Note that the result is very poor because there is only one training example.
|
||||
"""
|
||||
234
backend/venv/Lib/site-packages/nltk/parse/util.py
Normal file
@@ -0,0 +1,234 @@
|
||||
# Natural Language Toolkit: Parser Utility Functions
|
||||
#
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# Tom Aarsen <>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
"""
|
||||
Utility functions for parsers.
|
||||
"""
|
||||
|
||||
from nltk.data import load
|
||||
from nltk.grammar import CFG, PCFG, FeatureGrammar
|
||||
from nltk.parse.chart import Chart, ChartParser
|
||||
from nltk.parse.featurechart import FeatureChart, FeatureChartParser
|
||||
from nltk.parse.pchart import InsideChartParser
|
||||
|
||||
|
||||
def load_parser(
|
||||
grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
|
||||
):
|
||||
"""
|
||||
Load a grammar from a file, and build a parser based on that grammar.
|
||||
The parser depends on the grammar format, and might also depend
|
||||
on properties of the grammar itself.
|
||||
|
||||
The following grammar formats are currently supported:
|
||||
- ``'cfg'`` (CFGs: ``CFG``)
|
||||
- ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
|
||||
- ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)
|
||||
|
||||
:type grammar_url: str
|
||||
:param grammar_url: A URL specifying where the grammar is located.
|
||||
The default protocol is ``"nltk:"``, which searches for the file
|
||||
in the NLTK data package.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing output.
|
||||
:param parser: The class used for parsing; should be ``ChartParser``
|
||||
or a subclass.
|
||||
If None, the class depends on the grammar format.
|
||||
:param chart_class: The class used for storing the chart;
|
||||
should be ``Chart`` or a subclass.
|
||||
Only used for CFGs and feature CFGs.
|
||||
If None, the chart class depends on the grammar format.
|
||||
:type beam_size: int
|
||||
:param beam_size: The maximum length for the parser's edge queue.
|
||||
Only used for probabilistic CFGs.
|
||||
:param load_args: Keyword parameters used when loading the grammar.
|
||||
See ``data.load`` for more information.
|
||||
"""
|
||||
grammar = load(grammar_url, **load_args)
|
||||
if not isinstance(grammar, CFG):
|
||||
raise ValueError("The grammar must be a CFG, " "or a subclass thereof.")
|
||||
if isinstance(grammar, PCFG):
|
||||
if parser is None:
|
||||
parser = InsideChartParser
|
||||
return parser(grammar, trace=trace, beam_size=beam_size)
|
||||
|
||||
elif isinstance(grammar, FeatureGrammar):
|
||||
if parser is None:
|
||||
parser = FeatureChartParser
|
||||
if chart_class is None:
|
||||
chart_class = FeatureChart
|
||||
return parser(grammar, trace=trace, chart_class=chart_class)
|
||||
|
||||
else: # Plain CFG.
|
||||
if parser is None:
|
||||
parser = ChartParser
|
||||
if chart_class is None:
|
||||
chart_class = Chart
|
||||
return parser(grammar, trace=trace, chart_class=chart_class)
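# Illustrative usage sketch (added; assumes the NLTK data package is installed,
# where "grammars/book_grammars/feat0.fcfg" is the feature grammar used in the
# NLTK book):
# parser = load_parser("grammars/book_grammars/feat0.fcfg", trace=0)
# for tree in parser.parse("Kim likes children".split()):
#     print(tree)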
|
||||
|
||||
|
||||
def taggedsent_to_conll(sentence):
|
||||
"""
|
||||
Convert a single POS-tagged sentence into CoNLL format.
|
||||
|
||||
>>> from nltk import word_tokenize, pos_tag
|
||||
>>> text = "This is a foobar sentence."
|
||||
>>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
|
||||
... print(line, end="")
|
||||
1 This _ DT DT _ 0 a _ _
|
||||
2 is _ VBZ VBZ _ 0 a _ _
|
||||
3 a _ DT DT _ 0 a _ _
|
||||
4 foobar _ JJ JJ _ 0 a _ _
|
||||
5 sentence _ NN NN _ 0 a _ _
|
||||
6 . _ . . _ 0 a _ _
|
||||
|
||||
:param sentence: A single input sentence to parse
|
||||
:type sentence: list(tuple(str, str))
|
||||
:rtype: iter(str)
|
||||
:return: a generator yielding a single sentence in CONLL format.
|
||||
"""
|
||||
for i, (word, tag) in enumerate(sentence, start=1):
|
||||
input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"]
|
||||
input_str = "\t".join(input_str) + "\n"
|
||||
yield input_str
|
||||
|
||||
|
||||
def taggedsents_to_conll(sentences):
|
||||
"""
|
||||
Convert a POS-tagged document stream (i.e. a list of sentences, each a list
of (word, tag) tuples) and yield lines in CoNLL format. One line is yielded
per word, plus two newlines at the end of each sentence.
|
||||
|
||||
>>> from nltk import word_tokenize, sent_tokenize, pos_tag
|
||||
>>> text = "This is a foobar sentence. Is that right?"
|
||||
>>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
|
||||
>>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
|
||||
... if line:
|
||||
... print(line, end="")
|
||||
1 This _ DT DT _ 0 a _ _
|
||||
2 is _ VBZ VBZ _ 0 a _ _
|
||||
3 a _ DT DT _ 0 a _ _
|
||||
4 foobar _ JJ JJ _ 0 a _ _
|
||||
5 sentence _ NN NN _ 0 a _ _
|
||||
6 . _ . . _ 0 a _ _
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
1 Is _ VBZ VBZ _ 0 a _ _
|
||||
2 that _ IN IN _ 0 a _ _
|
||||
3 right _ NN NN _ 0 a _ _
|
||||
4 ? _ . . _ 0 a _ _
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(tuple(str, str)))
|
||||
:rtype: iter(str)
|
||||
:return: a generator yielding sentences in CONLL format.
|
||||
"""
|
||||
for sentence in sentences:
|
||||
yield from taggedsent_to_conll(sentence)
|
||||
yield "\n\n"
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Test Suites
|
||||
######################################################################
|
||||
|
||||
|
||||
class TestGrammar:
|
||||
"""
|
||||
Unit tests for CFG.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, suite, accept=None, reject=None):
|
||||
self.test_grammar = grammar
|
||||
|
||||
self.cp = load_parser(grammar, trace=0)
|
||||
self.suite = suite
|
||||
self._accept = accept
|
||||
self._reject = reject
|
||||
|
||||
def run(self, show_trees=False):
|
||||
"""
|
||||
Sentences in the test suite are divided into two classes:
|
||||
|
||||
- grammatical (``accept``) and
|
||||
- ungrammatical (``reject``).
|
||||
|
||||
If a sentence should parse according to the grammar, the value of
|
||||
``trees`` will be a non-empty list. If a sentence should be rejected
|
||||
according to the grammar, then the value of ``trees`` will be None.
|
||||
"""
|
||||
for test in self.suite:
|
||||
print(test["doc"] + ":", end=" ")
|
||||
for key in ["accept", "reject"]:
|
||||
for sent in test[key]:
|
||||
tokens = sent.split()
|
||||
trees = list(self.cp.parse(tokens))
|
||||
if show_trees and trees:
|
||||
print()
|
||||
print(sent)
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
if key == "accept":
|
||||
if trees == []:
|
||||
raise ValueError("Sentence '%s' failed to parse'" % sent)
|
||||
else:
|
||||
accepted = True
|
||||
else:
|
||||
if trees:
|
||||
raise ValueError("Sentence '%s' received a parse'" % sent)
|
||||
else:
|
||||
rejected = True
|
||||
if accepted and rejected:
|
||||
print("All tests passed!")
|
||||
|
||||
|
||||
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
|
||||
"""
|
||||
Parses a string with one test sentence per line.
|
||||
Lines can optionally begin with:
|
||||
|
||||
- a bool, saying if the sentence is grammatical or not, or
|
||||
- an int, giving the number of parse trees it should have,
|
||||
|
||||
The result information is followed by a colon, and then the sentence.
|
||||
Empty lines and lines beginning with a comment char are ignored.
|
||||
|
||||
:return: a list of tuple of sentences and expected results,
|
||||
where a sentence is a list of str,
|
||||
and a result is None, or bool, or int
|
||||
|
||||
:param comment_chars: ``str`` of possible comment characters.
|
||||
:param encoding: the encoding of the string, if it is binary
|
||||
"""
|
||||
if encoding is not None:
|
||||
string = string.decode(encoding)
|
||||
sentences = []
|
||||
for sentence in string.split("\n"):
|
||||
if sentence == "" or sentence[0] in comment_chars:
|
||||
continue
|
||||
split_info = sentence.split(":", 1)
|
||||
result = None
|
||||
if len(split_info) == 2:
|
||||
if split_info[0] in ["True", "true", "False", "false"]:
|
||||
result = split_info[0] in ["True", "true"]
|
||||
sentence = split_info[1]
|
||||
else:
|
||||
result = int(split_info[0])
|
||||
sentence = split_info[1]
|
||||
tokens = sentence.split()
|
||||
if tokens == []:
|
||||
continue
|
||||
sentences += [(tokens, result)]
|
||||
return sentences
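# Illustrative usage sketch (added):
# _suite = """
# # a comment line
# True: the dog barks
# 2: I saw the man with the telescope
# """
# extract_test_sentences(_suite)
# # -> [(['the', 'dog', 'barks'], True),
# #     (['I', 'saw', 'the', 'man', 'with', 'the', 'telescope'], 2)]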
|
||||
453
backend/venv/Lib/site-packages/nltk/parse/viterbi.py
Normal file
@@ -0,0 +1,453 @@
|
||||
# Natural Language Toolkit: Viterbi Probabilistic Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from functools import reduce
|
||||
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import ProbabilisticTree, Tree
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Viterbi PCFG Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class ViterbiParser(ParserI):
|
||||
"""
|
||||
A bottom-up ``PCFG`` parser that uses dynamic programming to find
|
||||
the single most likely parse for a text. The ``ViterbiParser`` parser
|
||||
parses texts by filling in a "most likely constituent table".
|
||||
This table records the most probable tree representation for any
|
||||
given span and node value. In particular, it has an entry for
|
||||
every start index, end index, and node value, recording the most
|
||||
likely subtree that spans from the start index to the end index,
|
||||
and has the given node value.
|
||||
|
||||
The ``ViterbiParser`` parser fills in this table incrementally. It starts
|
||||
by filling in all entries for constituents that span one element
|
||||
of text (i.e., entries where the end index is one greater than the
|
||||
start index). After it has filled in all table entries for
|
||||
constituents that span one element of text, it fills in the
|
||||
entries for constituents that span two elements of text. It
|
||||
continues filling in the entries for constituents spanning larger
|
||||
and larger portions of the text, until the entire table has been
|
||||
filled. Finally, it returns the table entry for a constituent
|
||||
spanning the entire text, whose node value is the grammar's start
|
||||
symbol.
|
||||
|
||||
In order to find the most likely constituent with a given span and
|
||||
node value, the ``ViterbiParser`` parser considers all productions that
|
||||
could produce that node value. For each production, it finds all
|
||||
children that collectively cover the span and have the node values
|
||||
specified by the production's right hand side. If the probability
|
||||
of the tree formed by applying the production to the children is
|
||||
greater than the probability of the current entry in the table,
|
||||
then the table is updated with this new tree.
|
||||
|
||||
A pseudo-code description of the algorithm used by
|
||||
``ViterbiParser`` is:
|
||||
|
||||
| Create an empty most likely constituent table, *MLC*.
|
||||
| For width in 1...len(text):
|
||||
| For start in 1...len(text)-width:
|
||||
| For prod in grammar.productions:
|
||||
| For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
|
||||
| where t[i].label()==prod.rhs[i],
|
||||
| and the sequence covers [start:start+width]:
|
||||
| old_p = MLC[start, start+width, prod.lhs]
|
||||
| new_p = P(t[1])P(t[2])...P(t[n])P(prod)
|
||||
| if new_p > old_p:
|
||||
| new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
|
||||
| MLC[start, start+width, prod.lhs] = new_tree
|
||||
| Return MLC[0, len(text), start_symbol]
|
||||
|
||||
:type _grammar: PCFG
|
||||
:ivar _grammar: The grammar used to parse sentences.
|
||||
:type _trace: int
|
||||
:ivar _trace: The level of tracing output that should be generated
|
||||
when parsing a text.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
"""
|
||||
Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
|
||||
parse texts.
|
||||
|
||||
:type grammar: PCFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
self._trace = trace
|
||||
|
||||
def parse(self, tokens):
|
||||
# Inherit docs from ParserI
|
||||
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
|
||||
# The most likely constituent table. This table specifies the
|
||||
# most likely constituent for a given span and type.
|
||||
# Constituents can be either Trees or tokens. For Trees,
|
||||
# the "type" is the Nonterminal for the tree's root node
|
||||
# value. For Tokens, the "type" is the token's type.
|
||||
# The table is stored as a dictionary, since it is sparse.
|
||||
constituents = {}
|
||||
|
||||
# Initialize the constituents dictionary with the words from
|
||||
# the text.
|
||||
if self._trace:
|
||||
print("Inserting tokens into the most likely" + " constituents table...")
|
||||
for index in range(len(tokens)):
|
||||
token = tokens[index]
|
||||
constituents[index, index + 1, token] = token
|
||||
if self._trace > 1:
|
||||
self._trace_lexical_insertion(token, index, len(tokens))
|
||||
|
||||
# Consider each span of length 1, 2, ..., n; and add any trees
|
||||
# that might cover that span to the constituents dictionary.
|
||||
for length in range(1, len(tokens) + 1):
|
||||
if self._trace:
|
||||
print(
|
||||
"Finding the most likely constituents"
|
||||
+ " spanning %d text elements..." % length
|
||||
)
|
||||
for start in range(len(tokens) - length + 1):
|
||||
span = (start, start + length)
|
||||
self._add_constituents_spanning(span, constituents, tokens)
|
||||
|
||||
# Return the tree that spans the entire text & have the right cat
|
||||
tree = constituents.get((0, len(tokens), self._grammar.start()))
|
||||
if tree is not None:
|
||||
yield tree
|
||||
|
||||
def _add_constituents_spanning(self, span, constituents, tokens):
|
||||
"""
|
||||
Find any constituents that might cover ``span``, and add them
|
||||
to the most likely constituents table.
|
||||
|
||||
:rtype: None
|
||||
:type span: tuple(int, int)
|
||||
:param span: The section of the text for which we are
|
||||
trying to find possible constituents. The span is
|
||||
specified as a pair of integers, where the first integer
|
||||
is the index of the first token that should be included in
|
||||
the constituent; and the second integer is the index of
|
||||
the first token that should not be included in the
|
||||
constituent. I.e., the constituent should cover
|
||||
``text[span[0]:span[1]]``, where ``text`` is the text
|
||||
that we are parsing.
|
||||
|
||||
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
||||
:param constituents: The most likely constituents table. This
|
||||
table records the most probable tree representation for
|
||||
any given span and node value. In particular,
|
||||
``constituents(s,e,nv)`` is the most likely
|
||||
``ProbabilisticTree`` that covers ``text[s:e]``
|
||||
and has a node value ``nv.symbol()``, where ``text``
|
||||
is the text that we are parsing. When
|
||||
``_add_constituents_spanning`` is called, ``constituents``
|
||||
should contain all possible constituents that are shorter
|
||||
than ``span``.
|
||||
|
||||
:type tokens: list of tokens
|
||||
:param tokens: The text we are parsing. This is only used for
|
||||
trace output.
|
||||
"""
|
||||
# Since some of the grammar productions may be unary, we need to
|
||||
# repeatedly try all of the productions until none of them add any
|
||||
# new constituents.
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
|
||||
# Find all instantiations of the grammar productions that
|
||||
# cover the span.
|
||||
instantiations = self._find_instantiations(span, constituents)
|
||||
|
||||
# For each production instantiation, add a new
|
||||
# ProbabilisticTree whose probability is the product
|
||||
# of the children's probabilities and the production's
|
||||
# probability.
|
||||
for production, children in instantiations:
|
||||
subtrees = [c for c in children if isinstance(c, Tree)]
|
||||
p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
|
||||
node = production.lhs().symbol()
|
||||
tree = ProbabilisticTree(node, children, prob=p)
|
||||
|
||||
# If it's a new constituent, then add it to the
|
||||
# constituents dictionary.
|
||||
c = constituents.get((span[0], span[1], production.lhs()))
|
||||
if self._trace > 1:
|
||||
if c is None or c != tree:
|
||||
if c is None or c.prob() < tree.prob():
|
||||
print(" Insert:", end=" ")
|
||||
else:
|
||||
print(" Discard:", end=" ")
|
||||
self._trace_production(production, p, span, len(tokens))
|
||||
if c is None or c.prob() < tree.prob():
|
||||
constituents[span[0], span[1], production.lhs()] = tree
|
||||
changed = True
|
||||
|
||||
def _find_instantiations(self, span, constituents):
|
||||
"""
|
||||
:return: a list of the production instantiations that cover a
given span of the text. A "production instantiation" is
a tuple containing a production and a list of children,
where the production's right hand side matches the list of
children; and the children cover ``span``.
:rtype: list(tuple(Production, list(ProbabilisticTree or token)))
|
||||
|
||||
:type span: tuple(int, int)
|
||||
:param span: The section of the text for which we are
|
||||
trying to find production instantiations. The span is
|
||||
specified as a pair of integers, where the first integer
|
||||
is the index of the first token that should be covered by
|
||||
the production instantiation; and the second integer is
|
||||
the index of the first token that should not be covered by
|
||||
the production instantiation.
|
||||
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
||||
:param constituents: The most likely constituents table. This
|
||||
table records the most probable tree representation for
|
||||
any given span and node value. See the module
|
||||
documentation for more information.
|
||||
"""
|
||||
rv = []
|
||||
for production in self._grammar.productions():
|
||||
childlists = self._match_rhs(production.rhs(), span, constituents)
|
||||
|
||||
for childlist in childlists:
|
||||
rv.append((production, childlist))
|
||||
return rv
|
||||
|
||||
def _match_rhs(self, rhs, span, constituents):
|
||||
"""
|
||||
:return: a set of all the lists of children that cover ``span``
|
||||
and that match ``rhs``.
|
||||
:rtype: list(list(ProbabilisticTree or token))
|
||||
|
||||
:type rhs: list(Nonterminal or any)
|
||||
:param rhs: The list specifying what kinds of children need to
|
||||
cover ``span``. Each nonterminal in ``rhs`` specifies
|
||||
that the corresponding child should be a tree whose node
|
||||
value is that nonterminal's symbol. Each terminal in ``rhs``
|
||||
specifies that the corresponding child should be a token
|
||||
whose type is that terminal.
|
||||
:type span: tuple(int, int)
|
||||
:param span: The section of the text for which we are
|
||||
trying to find child lists. The span is specified as a
|
||||
pair of integers, where the first integer is the index of
|
||||
the first token that should be covered by the child list;
|
||||
and the second integer is the index of the first token
|
||||
that should not be covered by the child list.
|
||||
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
||||
:param constituents: The most likely constituents table. This
|
||||
table records the most probable tree representation for
|
||||
any given span and node value. See the module
|
||||
documentation for more information.
|
||||
"""
|
||||
(start, end) = span
|
||||
|
||||
# Base case
|
||||
if start >= end and rhs == ():
|
||||
return [[]]
|
||||
if start >= end or rhs == ():
|
||||
return []
|
||||
|
||||
# Find everything that matches the 1st symbol of the RHS
|
||||
childlists = []
|
||||
for split in range(start, end + 1):
|
||||
l = constituents.get((start, split, rhs[0]))
|
||||
if l is not None:
|
||||
rights = self._match_rhs(rhs[1:], (split, end), constituents)
|
||||
childlists += [[l] + r for r in rights]
|
||||
|
||||
return childlists
|
||||
|
||||
def _trace_production(self, production, p, span, width):
|
||||
"""
|
||||
Print trace output indicating that a given production has been
|
||||
applied at a given location.
|
||||
|
||||
:param production: The production that has been applied
|
||||
:type production: Production
|
||||
:param p: The probability of the tree produced by the production.
|
||||
:type p: float
|
||||
:param span: The span of the production
|
||||
:type span: tuple
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
str = "|" + "." * span[0]
|
||||
str += "=" * (span[1] - span[0])
|
||||
str += "." * (width - span[1]) + "| "
|
||||
str += "%s" % production
|
||||
if self._trace > 2:
|
||||
str = f"{str:<40} {p:12.10f} "
|
||||
|
||||
print(str)
|
||||
|
||||
def _trace_lexical_insertion(self, token, index, width):
|
||||
str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
|
||||
str += f"{token}"
|
||||
print(str)
|
||||
|
||||
def __repr__(self):
|
||||
return "<ViterbiParser for %r>" % self._grammar
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Test Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
A demonstration of the Viterbi probabilistic parser. The user is
prompted to select which demo sentence to use; the parser is then run on
that sentence, and a summary of the results is displayed.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk import tokenize
|
||||
from nltk.grammar import PCFG
|
||||
from nltk.parse import ViterbiParser
|
||||
|
||||
toy_pcfg1 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
|
||||
Det -> 'the' [0.8] | 'my' [0.2]
|
||||
N -> 'man' [0.5] | 'telescope' [0.5]
|
||||
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
|
||||
V -> 'ate' [0.35] | 'saw' [0.65]
|
||||
PP -> P NP [1.0]
|
||||
P -> 'with' [0.61] | 'under' [0.39]
|
||||
"""
|
||||
)
|
||||
|
||||
toy_pcfg2 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
VP -> V NP [.59]
|
||||
VP -> V [.40]
|
||||
VP -> VP PP [.01]
|
||||
NP -> Det N [.41]
|
||||
NP -> Name [.28]
|
||||
NP -> NP PP [.31]
|
||||
PP -> P NP [1.0]
|
||||
V -> 'saw' [.21]
|
||||
V -> 'ate' [.51]
|
||||
V -> 'ran' [.28]
|
||||
N -> 'boy' [.11]
|
||||
N -> 'cookie' [.12]
|
||||
N -> 'table' [.13]
|
||||
N -> 'telescope' [.14]
|
||||
N -> 'hill' [.5]
|
||||
Name -> 'Jack' [.52]
|
||||
Name -> 'Bob' [.48]
|
||||
P -> 'with' [.61]
|
||||
P -> 'under' [.39]
|
||||
Det -> 'the' [.41]
|
||||
Det -> 'a' [.31]
|
||||
Det -> 'my' [.28]
|
||||
"""
|
||||
)
|
||||
|
||||
# Define two demos. Each demo has a sentence and a grammar.
|
||||
demos = [
|
||||
("I saw the man with my telescope", toy_pcfg1),
|
||||
("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
|
||||
]
|
||||
|
||||
# Ask the user which demo they want to use.
|
||||
print()
|
||||
for i in range(len(demos)):
|
||||
print(f"{i + 1:>3}: {demos[i][0]}")
|
||||
print(" %r" % demos[i][1])
|
||||
print()
|
||||
print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
|
||||
try:
|
||||
snum = int(sys.stdin.readline().strip()) - 1
|
||||
sent, grammar = demos[snum]
|
||||
except:
|
||||
print("Bad sentence number")
|
||||
return
|
||||
|
||||
# Tokenize the sentence.
|
||||
tokens = sent.split()
|
||||
|
||||
parser = ViterbiParser(grammar)
|
||||
all_parses = {}
|
||||
|
||||
print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
|
||||
parser.trace(3)
|
||||
t = time.time()
|
||||
parses = parser.parse_all(tokens)
|
||||
elapsed = time.time() - t  # avoid shadowing the time module
|
||||
average = (
|
||||
reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
|
||||
)
|
||||
num_parses = len(parses)
|
||||
for p in parses:
|
||||
all_parses[p.freeze()] = 1
|
||||
|
||||
# Print some summary statistics
|
||||
print()
|
||||
print("Time (secs) # Parses Average P(parse)")
|
||||
print("-----------------------------------------")
|
||||
print("%11.4f%11d%19.14f" % (time, num_parses, average))
|
||||
parses = all_parses.keys()
|
||||
if parses:
|
||||
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
|
||||
else:
|
||||
p = 0
|
||||
print("------------------------------------------")
|
||||
print("%11s%11d%19.14f" % ("n/a", len(parses), p))
|
||||
|
||||
# Ask the user if we should draw the parses.
|
||||
print()
|
||||
print("Draw parses (y/n)? ", end=" ")
|
||||
if sys.stdin.readline().strip().lower().startswith("y"):
|
||||
from nltk.draw.tree import draw_trees
|
||||
|
||||
print(" please wait...")
|
||||
draw_trees(*parses)
|
||||
|
||||
# Ask the user if we should print the parses.
|
||||
print()
|
||||
print("Print parses (y/n)? ", end=" ")
|
||||
if sys.stdin.readline().strip().lower().startswith("y"):
|
||||
for parse in parses:
|
||||
print(parse)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||