Initial commit
101
backend/venv/Lib/site-packages/nltk/classify/__init__.py
Normal file
@@ -0,0 +1,101 @@
# Natural Language Toolkit: Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Classes and interfaces for labeling tokens with category labels (or
"class labels"). Typically, labels are represented with strings
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
perform a wide range of classification tasks. For example,
classifiers can be used...

- to classify documents by topic
- to classify ambiguous words by which word sense is intended
- to classify acoustic signals by which phoneme they represent
- to classify sentences by their author

Features
========
In order to decide which category label is appropriate for a given
token, classifiers examine one or more 'features' of the token. These
"features" are typically chosen by hand, and indicate which aspects
of the token are relevant to the classification decision. For
example, a document classifier might use a separate feature for each
word, recording how often that word occurred in the document.

Featuresets
===========
The features describing a token are encoded using a "featureset",
which is a dictionary that maps from "feature names" to "feature
values". Feature names are unique strings that indicate what aspect
of the token is encoded by the feature. Examples include
``'prevword'``, for a feature whose value is the previous word; and
``'contains-word(library)'`` for a feature that is true when a document
contains the word ``'library'``. Feature values are typically
booleans, numbers, or strings, depending on which feature they
describe.

Featuresets are typically constructed using a "feature detector"
(also known as a "feature extractor"). A feature detector is a
function that takes a token (and sometimes information about its
context) as its input, and returns a featureset describing that token.
For example, the following feature detector converts a document
(stored as a list of words) to a featureset describing the set of
words included in the document:

    >>> # Define a feature detector function.
    >>> def document_features(document):
    ...     return dict([('contains-word(%s)' % w, True) for w in document])

Feature detectors are typically applied to each token before it is fed
to the classifier:

    >>> # Classify each Gutenberg document.
    >>> from nltk.corpus import gutenberg
    >>> for fileid in gutenberg.fileids(): # doctest: +SKIP
    ...     doc = gutenberg.words(fileid) # doctest: +SKIP
    ...     print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP

The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
feature detector for word sense disambiguation (WSD) might take as its
input a sentence, and the index of a word that should be classified,
and return a featureset for that word. The following feature detector
for WSD includes features describing the left and right contexts of
the target word:

    >>> def wsd_features(sentence, index):
    ...     featureset = {}
    ...     for i in range(max(0, index-3), index):
    ...         featureset['left-context(%s)' % sentence[i]] = True
    ...     for i in range(index, max(index+3, len(sentence))):
    ...         featureset['right-context(%s)' % sentence[i]] = True
    ...     return featureset

Training Classifiers
====================
Most classifiers are built by training them on a list of hand-labeled
examples, known as the "training set". Training sets are represented
as lists of ``(featuredict, label)`` tuples.
"""

from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.maxent import (
    BinaryMaxentFeatureEncoding,
    ConditionalExponentialClassifier,
    MaxentClassifier,
    TypedMaxentFeatureEncoding,
)
from nltk.classify.megam import call_megam, config_megam
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.weka import WekaClassifier, config_weka
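The "Training Classifiers" section above can be exercised end to end. The sketch below builds a toy training set of ``(featuredict, label)`` tuples with the ``document_features`` detector from the docstring and trains a ``NaiveBayesClassifier`` on it; the short documents and the ``'sports'``/``'health'`` labels are invented purely for illustration:

    >>> from nltk.classify import NaiveBayesClassifier
    >>> def document_features(document):
    ...     return {'contains-word(%s)' % w: True for w in document}
    >>> train_set = [
    ...     (document_features('the team won the game'.split()), 'sports'),
    ...     (document_features('the goalkeeper saved the shot'.split()), 'sports'),
    ...     (document_features('the new diet lowers blood pressure'.split()), 'health'),
    ...     (document_features('doctors recommend regular exercise'.split()), 'health'),
    ... ]
    >>> classifier = NaiveBayesClassifier.train(train_set)
    >>> classifier.classify(document_features('the team lost the game'.split()))
    'sports'

Feature names never seen during training (here ``contains-word(lost)``) are simply ignored by the Naive Bayes classifier at classification time rather than zeroing out every label.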
195
backend/venv/Lib/site-packages/nltk/classify/api.py
Normal file
@@ -0,0 +1,195 @@
# Natural Language Toolkit: Classifier Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Interfaces for labeling tokens with category labels (or "class labels").

``ClassifierI`` is a standard interface for "single-category
classification", in which the set of categories is known, the number
of categories is finite, and each text belongs to exactly one
category.

``MultiClassifierI`` is a standard interface for "multi-category
classification", which is like single-category classification except
that each text belongs to zero or more categories.
"""
from nltk.internals import overridden

##//////////////////////////////////////////////////////
# { Classification Interfaces
##//////////////////////////////////////////////////////


class ClassifierI:
    """
    A processing interface for labeling tokens with a single category
    label (or "class"). Labels are typically strs or
    ints, but can be any immutable type. The set of labels
    that the classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]


class MultiClassifierI:
    """
    A processing interface for labeling tokens with zero or more
    category labels (or "labels"). Labels are typically strs
    or ints, but can be any immutable type. The set of labels
    that the multi-classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(set(label))
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]


# # [XX] IN PROGRESS:
# class SequenceClassifierI:
#     """
#     A processing interface for labeling sequences of tokens with a
#     single category label (or "class"). Labels are typically
#     strs or ints, but can be any immutable type. The set
#     of labels that the classifier chooses from must be fixed and
#     finite.
#     """
#     def labels(self):
#         """
#         :return: the list of category labels used by this classifier.
#         :rtype: list of (immutable)
#         """
#         raise NotImplementedError()

#     def prob_classify(self, featureset):
#         """
#         Return a probability distribution over labels for the given
#         featureset.

#         If ``featureset`` is a list of featuresets, then return a
#         corresponding list containing the probability distribution
#         over labels for each of the given featuresets, where the
#         *i*\ th element of this list is the most appropriate label for
#         the *i*\ th element of ``featuresets``.
#         """
#         raise NotImplementedError()

#     def classify(self, featureset):
#         """
#         Return the most appropriate label for the given featureset.

#         If ``featureset`` is a list of featuresets, then return a
#         corresponding list containing the most appropriate label for
#         each of the given featuresets, where the *i*\ th element of
#         this list is the most appropriate label for the *i*\ th element
#         of ``featuresets``.
#         """
#         raise NotImplementedError()
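As a sketch of how this interface is consumed, the minimal subclass below implements only ``labels()`` and ``classify()``; the inherited ``classify_many()`` then works automatically, and in the opposite case ``classify()`` would fall back to ``classify_many()`` via ``overridden``. The keyword rule is hypothetical and exists only to illustrate the contract:

    >>> from nltk.classify.api import ClassifierI
    >>> class KeywordClassifier(ClassifierI):
    ...     # Label a featureset 'sports' if it mentions a ball, else 'other'.
    ...     def labels(self):
    ...         return ['sports', 'other']
    ...     def classify(self, featureset):
    ...         return 'sports' if featureset.get('contains-word(ball)') else 'other'
    >>> clf = KeywordClassifier()
    >>> clf.classify_many([{'contains-word(ball)': True}, {'contains-word(vote)': True}])
    ['sports', 'other']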
349
backend/venv/Lib/site-packages/nltk/classify/decisiontree.py
Normal file
@@ -0,0 +1,349 @@
|
||||
# Natural Language Toolkit: Decision Tree Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier model that decides which label to assign to a token on
|
||||
the basis of a tree structure, where branches correspond to conditions
|
||||
on feature values, and leaves correspond to label assignments.
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import FreqDist, MLEProbDist, entropy
|
||||
|
||||
|
||||
class DecisionTreeClassifier(ClassifierI):
|
||||
def __init__(self, label, feature_name=None, decisions=None, default=None):
|
||||
"""
|
||||
:param label: The most likely label for tokens that reach
|
||||
this node in the decision tree. If this decision tree
|
||||
has no children, then this label will be assigned to
|
||||
any token that reaches this decision tree.
|
||||
:param feature_name: The name of the feature that this
|
||||
decision tree selects for.
|
||||
:param decisions: A dictionary mapping from feature values
|
||||
for the feature identified by ``feature_name`` to
|
||||
child decision trees.
|
||||
:param default: The child that will be used if the value of
|
||||
feature ``feature_name`` does not match any of the keys in
|
||||
``decisions``. This is used when constructing binary
|
||||
decision trees.
|
||||
"""
|
||||
self._label = label
|
||||
self._fname = feature_name
|
||||
self._decisions = decisions
|
||||
self._default = default
|
||||
|
||||
def labels(self):
|
||||
labels = [self._label]
|
||||
if self._decisions is not None:
|
||||
for dt in self._decisions.values():
|
||||
labels.extend(dt.labels())
|
||||
if self._default is not None:
|
||||
labels.extend(self._default.labels())
|
||||
return list(set(labels))
|
||||
|
||||
def classify(self, featureset):
|
||||
# Decision leaf:
|
||||
if self._fname is None:
|
||||
return self._label
|
||||
|
||||
# Decision tree:
|
||||
fval = featureset.get(self._fname)
|
||||
if fval in self._decisions:
|
||||
return self._decisions[fval].classify(featureset)
|
||||
elif self._default is not None:
|
||||
return self._default.classify(featureset)
|
||||
else:
|
||||
return self._label
|
||||
|
||||
def error(self, labeled_featuresets):
|
||||
errors = 0
|
||||
for featureset, label in labeled_featuresets:
|
||||
if self.classify(featureset) != label:
|
||||
errors += 1
|
||||
return errors / len(labeled_featuresets)
|
||||
|
||||
def pretty_format(self, width=70, prefix="", depth=4):
|
||||
"""
|
||||
Return a string containing a pretty-printed version of this
|
||||
decision tree. Each line in this string corresponds to a
|
||||
single decision tree node or leaf, and indentation is used to
|
||||
display the structure of the decision tree.
|
||||
"""
|
||||
# [xx] display default!!
|
||||
if self._fname is None:
|
||||
n = width - len(prefix) - 15
|
||||
return "{}{} {}\n".format(prefix, "." * n, self._label)
|
||||
s = ""
|
||||
for i, (fval, result) in enumerate(
|
||||
sorted(
|
||||
self._decisions.items(),
|
||||
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
|
||||
)
|
||||
):
|
||||
hdr = f"{prefix}{self._fname}={fval}? "
|
||||
n = width - 15 - len(hdr)
|
||||
s += "{}{} {}\n".format(hdr, "." * (n), result._label)
|
||||
if result._fname is not None and depth > 1:
|
||||
s += result.pretty_format(width, prefix + " ", depth - 1)
|
||||
if self._default is not None:
|
||||
n = width - len(prefix) - 21
|
||||
s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
|
||||
if self._default._fname is not None and depth > 1:
|
||||
s += self._default.pretty_format(width, prefix + " ", depth - 1)
|
||||
return s
|
||||
|
||||
def pseudocode(self, prefix="", depth=4):
|
||||
"""
|
||||
Return a string representation of this decision tree that
|
||||
expresses the decisions it makes as a nested set of pseudocode
|
||||
if statements.
|
||||
"""
|
||||
if self._fname is None:
|
||||
return f"{prefix}return {self._label!r}\n"
|
||||
s = ""
|
||||
for fval, result in sorted(
|
||||
self._decisions.items(),
|
||||
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
|
||||
):
|
||||
s += f"{prefix}if {self._fname} == {fval!r}: "
|
||||
if result._fname is not None and depth > 1:
|
||||
s += "\n" + result.pseudocode(prefix + " ", depth - 1)
|
||||
else:
|
||||
s += f"return {result._label!r}\n"
|
||||
if self._default is not None:
|
||||
if len(self._decisions) == 1:
|
||||
s += "{}if {} != {!r}: ".format(
|
||||
prefix, self._fname, list(self._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
s += f"{prefix}else: "
|
||||
if self._default._fname is not None and depth > 1:
|
||||
s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
|
||||
else:
|
||||
s += f"return {self._default._label!r}\n"
|
||||
return s
|
||||
|
||||
def __str__(self):
|
||||
return self.pretty_format()
|
||||
|
||||
@staticmethod
|
||||
def train(
|
||||
labeled_featuresets,
|
||||
entropy_cutoff=0.05,
|
||||
depth_cutoff=100,
|
||||
support_cutoff=10,
|
||||
binary=False,
|
||||
feature_values=None,
|
||||
verbose=False,
|
||||
):
|
||||
"""
|
||||
:param binary: If true, then treat all feature/value pairs as
|
||||
individual binary features, rather than using a single n-way
|
||||
branch for each feature.
|
||||
"""
|
||||
# Collect a list of all feature names.
|
||||
feature_names = set()
|
||||
for featureset, label in labeled_featuresets:
|
||||
for fname in featureset:
|
||||
feature_names.add(fname)
|
||||
|
||||
# Collect a list of the values each feature can take.
|
||||
if feature_values is None and binary:
|
||||
feature_values = defaultdict(set)
|
||||
for featureset, label in labeled_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
feature_values[fname].add(fval)
|
||||
|
||||
# Start with a stump.
|
||||
if not binary:
|
||||
tree = DecisionTreeClassifier.best_stump(
|
||||
feature_names, labeled_featuresets, verbose
|
||||
)
|
||||
else:
|
||||
tree = DecisionTreeClassifier.best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose
|
||||
)
|
||||
|
||||
# Refine the stump.
|
||||
tree.refine(
|
||||
labeled_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff - 1,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
|
||||
# Return it
|
||||
return tree
|
||||
|
||||
@staticmethod
|
||||
def leaf(labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
return DecisionTreeClassifier(label)
|
||||
|
||||
@staticmethod
|
||||
def stump(feature_name, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
freqs = defaultdict(FreqDist) # freq(label|value)
|
||||
for featureset, label in labeled_featuresets:
|
||||
feature_value = featureset.get(feature_name)
|
||||
freqs[feature_value][label] += 1
|
||||
|
||||
decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
|
||||
return DecisionTreeClassifier(label, feature_name, decisions)
|
||||
|
||||
def refine(
|
||||
self,
|
||||
labeled_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary=False,
|
||||
feature_values=None,
|
||||
verbose=False,
|
||||
):
|
||||
if len(labeled_featuresets) <= support_cutoff:
|
||||
return
|
||||
if self._fname is None:
|
||||
return
|
||||
if depth_cutoff <= 0:
|
||||
return
|
||||
for fval in self._decisions:
|
||||
fval_featuresets = [
|
||||
(featureset, label)
|
||||
for (featureset, label) in labeled_featuresets
|
||||
if featureset.get(self._fname) == fval
|
||||
]
|
||||
|
||||
label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
|
||||
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
|
||||
self._decisions[fval] = DecisionTreeClassifier.train(
|
||||
fval_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
if self._default is not None:
|
||||
default_featuresets = [
|
||||
(featureset, label)
|
||||
for (featureset, label) in labeled_featuresets
|
||||
if featureset.get(self._fname) not in self._decisions
|
||||
]
|
||||
label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
|
||||
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
|
||||
self._default = DecisionTreeClassifier.train(
|
||||
default_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def best_stump(feature_names, labeled_featuresets, verbose=False):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), best_stump._fname, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
@staticmethod
|
||||
def binary_stump(feature_name, feature_value, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
pos_fdist = FreqDist()
|
||||
neg_fdist = FreqDist()
|
||||
for featureset, label in labeled_featuresets:
|
||||
if featureset.get(feature_name) == feature_value:
|
||||
pos_fdist[label] += 1
|
||||
else:
|
||||
neg_fdist[label] += 1
|
||||
|
||||
decisions = {}
|
||||
default = label
|
||||
# But hopefully we have observations!
|
||||
if pos_fdist.N() > 0:
|
||||
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
|
||||
if neg_fdist.N() > 0:
|
||||
default = DecisionTreeClassifier(neg_fdist.max())
|
||||
|
||||
return DecisionTreeClassifier(label, feature_name, decisions, default)
|
||||
|
||||
@staticmethod
|
||||
def best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose=False
|
||||
):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
for fval in feature_values[fname]:
|
||||
stump = DecisionTreeClassifier.binary_stump(
|
||||
fname, fval, labeled_featuresets
|
||||
)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
if best_stump._decisions:
|
||||
descr = "{}={}".format(
|
||||
best_stump._fname, list(best_stump._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
descr = "(default)"
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), descr, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def f(x):
|
||||
return DecisionTreeClassifier.train(x, binary=True, verbose=True)
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import binary_names_demo_features, names_demo
|
||||
|
||||
classifier = names_demo(
|
||||
f, binary_names_demo_features # DecisionTreeClassifier.train,
|
||||
)
|
||||
print(classifier.pretty_format(depth=7))
|
||||
print(classifier.pseudocode(depth=7))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
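A quick usage sketch of ``DecisionTreeClassifier.train`` on a hand-made weather toy set (the data is invented for illustration). With the defaults, the learner picks the lowest-error single-feature stump as the root, so ``'outlook'`` ends up deciding the label here:

    >>> from nltk.classify.decisiontree import DecisionTreeClassifier
    >>> train_set = [
    ...     ({'outlook': 'sunny', 'humidity': 'high'}, 'play'),
    ...     ({'outlook': 'sunny', 'humidity': 'normal'}, 'play'),
    ...     ({'outlook': 'rainy', 'humidity': 'high'}, 'stay'),
    ...     ({'outlook': 'rainy', 'humidity': 'normal'}, 'stay'),
    ... ]
    >>> tree = DecisionTreeClassifier.train(train_set)
    >>> tree.classify({'outlook': 'sunny', 'humidity': 'high'})
    'play'
    >>> tree.classify({'outlook': 'rainy', 'humidity': 'normal'})
    'stay'
    >>> print(tree.pseudocode(depth=2))  # doctest: +SKIP

Note that ``support_cutoff`` defaults to 10, so with only four examples the stump is not refined any further.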
1631
backend/venv/Lib/site-packages/nltk/classify/maxent.py
Normal file
File diff suppressed because it is too large
184
backend/venv/Lib/site-packages/nltk/classify/megam.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Natural Language Toolkit: Interface to Megam Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A set of functions used to interface with the external megam_ maxent
|
||||
optimization package. Before megam can be used, you should tell NLTK where it
|
||||
can find the megam binary, using the ``config_megam()`` function. Typical
|
||||
usage:
|
||||
|
||||
>>> from nltk.classify import megam
|
||||
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
|
||||
[Found megam: ...]
|
||||
|
||||
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
|
||||
for details.
|
||||
|
||||
nltk.classify.MaxentClassifier.train(corpus, 'megam')
|
||||
|
||||
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
numpy = None
|
||||
|
||||
######################################################################
|
||||
# { Configuration
|
||||
######################################################################
|
||||
|
||||
_megam_bin = None
|
||||
|
||||
|
||||
def config_megam(bin=None):
|
||||
"""
|
||||
Configure NLTK's interface to the ``megam`` maxent optimization
|
||||
package.
|
||||
|
||||
:param bin: The full path to the ``megam`` binary. If not specified,
|
||||
then nltk will search the system for a ``megam`` binary; and if
|
||||
one is not found, it will raise a ``LookupError`` exception.
|
||||
:type bin: str
|
||||
"""
|
||||
global _megam_bin
|
||||
_megam_bin = find_binary(
|
||||
"megam",
|
||||
bin,
|
||||
env_vars=["MEGAM"],
|
||||
binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
|
||||
url="https://www.umiacs.umd.edu/~hal/megam/index.html",
|
||||
)
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Megam Interface Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
|
||||
"""
|
||||
Generate an input file for ``megam`` based on the given corpus of
|
||||
classified tokens.
|
||||
|
||||
:type train_toks: list(tuple(dict, str))
|
||||
:param train_toks: Training data, represented as a list of
|
||||
pairs, the first member of which is a feature dictionary,
|
||||
and the second of which is a classification label.
|
||||
|
||||
:type encoding: MaxentFeatureEncodingI
|
||||
:param encoding: A feature encoding, used to convert featuresets
|
||||
into feature vectors. May optionally implement a cost() method
|
||||
in order to assign different costs to different class predictions.
|
||||
|
||||
:type stream: stream
|
||||
:param stream: The stream to which the megam input file should be
|
||||
written.
|
||||
|
||||
:param bernoulli: If true, then use the 'bernoulli' format. I.e.,
|
||||
all joint features have binary values, and are listed iff they
|
||||
are true. Otherwise, list feature values explicitly. If
|
||||
``bernoulli=False``, then you must call ``megam`` with the
|
||||
``-fvals`` option.
|
||||
|
||||
:param explicit: If true, then use the 'explicit' format. I.e.,
|
||||
list the features that would fire for any of the possible
|
||||
labels, for each token. If ``explicit=True``, then you must
|
||||
call ``megam`` with the ``-explicit`` option.
|
||||
"""
|
||||
# Look up the set of labels.
|
||||
labels = encoding.labels()
|
||||
labelnum = {label: i for (i, label) in enumerate(labels)}
|
||||
|
||||
# Write the file, which contains one line per instance.
|
||||
for featureset, label in train_toks:
|
||||
# First, the instance number (or, in the weighted multiclass case, the cost of each label).
|
||||
if hasattr(encoding, "cost"):
|
||||
stream.write(
|
||||
":".join(str(encoding.cost(featureset, label, l)) for l in labels)
|
||||
)
|
||||
else:
|
||||
stream.write("%d" % labelnum[label])
|
||||
|
||||
# For implicit file formats, just list the features that fire
|
||||
# for this instance's actual label.
|
||||
if not explicit:
|
||||
_write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
|
||||
|
||||
# For explicit formats, list the features that would fire for
|
||||
# any of the possible labels.
|
||||
else:
|
||||
for l in labels:
|
||||
stream.write(" #")
|
||||
_write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
|
||||
|
||||
# End of the instance.
|
||||
stream.write("\n")
|
||||
|
||||
|
||||
def parse_megam_weights(s, features_count, explicit=True):
|
||||
"""
|
||||
Given the stdout output generated by ``megam`` when training a
|
||||
model, return a ``numpy`` array containing the corresponding weight
|
||||
vector. This function does not currently handle bias features.
|
||||
"""
|
||||
if numpy is None:
|
||||
raise ValueError("This function requires that numpy be installed")
|
||||
assert explicit, "non-explicit not supported yet"
|
||||
lines = s.strip().split("\n")
|
||||
weights = numpy.zeros(features_count, "d")
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
fid, weight = line.split()
|
||||
weights[int(fid)] = float(weight)
|
||||
return weights
|
||||
|
||||
|
||||
def _write_megam_features(vector, stream, bernoulli):
|
||||
if not vector:
|
||||
raise ValueError(
|
||||
"MEGAM classifier requires the use of an " "always-on feature."
|
||||
)
|
||||
for fid, fval in vector:
|
||||
if bernoulli:
|
||||
if fval == 1:
|
||||
stream.write(" %s" % fid)
|
||||
elif fval != 0:
|
||||
raise ValueError(
|
||||
"If bernoulli=True, then all" "features must be binary."
|
||||
)
|
||||
else:
|
||||
stream.write(f" {fid} {fval}")
|
||||
|
||||
|
||||
def call_megam(args):
|
||||
"""
|
||||
Call the ``megam`` binary with the given arguments.
|
||||
"""
|
||||
if isinstance(args, str):
|
||||
raise TypeError("args should be a list of strings")
|
||||
if _megam_bin is None:
|
||||
config_megam()
|
||||
|
||||
# Call megam via a subprocess
|
||||
cmd = [_megam_bin] + args
|
||||
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
||||
(stdout, stderr) = p.communicate()
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
print()
|
||||
print(stderr)
|
||||
raise OSError("megam command failed!")
|
||||
|
||||
if isinstance(stdout, str):
|
||||
return stdout
|
||||
else:
|
||||
return stdout.decode("utf-8")
|
||||
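``parse_megam_weights`` can be exercised without the external binary, since it only parses megam's textual output (one "feature-id weight" pair per line). The output string below is fabricated for illustration, and numpy must be installed:

    >>> from nltk.classify.megam import parse_megam_weights
    >>> fake_output = "0 0.75\n2 -1.25\n"
    >>> parse_megam_weights(fake_output, features_count=4).tolist()
    [0.75, 0.0, -1.25, 0.0]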
260
backend/venv/Lib/site-packages/nltk/classify/naivebayes.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# Natural Language Toolkit: Naive Bayes Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier based on the Naive Bayes algorithm. In order to find the
|
||||
probability for a label, this algorithm first uses the Bayes rule to
|
||||
express P(label|features) in terms of P(label) and P(features|label):
|
||||
|
||||
| P(label) * P(features|label)
|
||||
| P(label|features) = ------------------------------
|
||||
| P(features)
|
||||
|
||||
The algorithm then makes the 'naive' assumption that all features are
|
||||
independent, given the label:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| P(features)
|
||||
|
||||
Rather than computing P(features) explicitly, the algorithm just
|
||||
calculates the numerator for each label, and normalizes them so they
|
||||
sum to one:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class NaiveBayesClassifier(ClassifierI):
|
||||
"""
|
||||
A Naive Bayes classifier. Naive Bayes classifiers are
|
||||
parameterized by two probability distributions:
|
||||
|
||||
- P(label) gives the probability that an input will receive each
|
||||
label, given no information about the input's features.
|
||||
|
||||
- P(fname=fval|label) gives the probability that a given feature
|
||||
(fname) will receive a given value (fval), given that the
|
||||
label (label).
|
||||
|
||||
If the classifier encounters an input with a feature that has
|
||||
never been seen with any label, then rather than assigning a
|
||||
probability of 0 to all labels, it will ignore that feature.
|
||||
|
||||
The feature value 'None' is reserved for unseen feature values;
|
||||
you generally should not use 'None' as a feature value for one of
|
||||
your own features.
|
||||
"""
|
||||
|
||||
def __init__(self, label_probdist, feature_probdist):
|
||||
"""
|
||||
:param label_probdist: P(label), the probability distribution
|
||||
over labels. It is expressed as a ``ProbDistI`` whose
|
||||
samples are labels. I.e., P(label) =
|
||||
``label_probdist.prob(label)``.
|
||||
|
||||
:param feature_probdist: P(fname=fval|label), the probability
|
||||
distribution for feature values, given labels. It is
|
||||
expressed as a dictionary whose keys are ``(label, fname)``
|
||||
pairs and whose values are ``ProbDistI`` objects over feature
|
||||
values. I.e., P(fname=fval|label) =
|
||||
``feature_probdist[label,fname].prob(fval)``. If a given
|
||||
``(label,fname)`` is not a key in ``feature_probdist``, then
|
||||
it is assumed that the corresponding P(fname=fval|label)
|
||||
is 0 for all values of ``fval``.
|
||||
"""
|
||||
self._label_probdist = label_probdist
|
||||
self._feature_probdist = feature_probdist
|
||||
self._labels = list(label_probdist.samples())
|
||||
|
||||
def labels(self):
|
||||
return self._labels
|
||||
|
||||
def classify(self, featureset):
|
||||
return self.prob_classify(featureset).max()
|
||||
|
||||
def prob_classify(self, featureset):
|
||||
# Discard any feature names that we've never seen before.
|
||||
# Otherwise, we'll just assign a probability of 0 to
|
||||
# everything.
|
||||
featureset = featureset.copy()
|
||||
for fname in list(featureset.keys()):
|
||||
for label in self._labels:
|
||||
if (label, fname) in self._feature_probdist:
|
||||
break
|
||||
else:
|
||||
# print('Ignoring unseen feature %s' % fname)
|
||||
del featureset[fname]
|
||||
|
||||
# Find the log probability of each label, given the features.
|
||||
# Start with the log probability of the label itself.
|
||||
logprob = {}
|
||||
for label in self._labels:
|
||||
logprob[label] = self._label_probdist.logprob(label)
|
||||
|
||||
# Then add in the log probability of features given labels.
|
||||
for label in self._labels:
|
||||
for fname, fval in featureset.items():
|
||||
if (label, fname) in self._feature_probdist:
|
||||
feature_probs = self._feature_probdist[label, fname]
|
||||
logprob[label] += feature_probs.logprob(fval)
|
||||
else:
|
||||
# nb: This case will never come up if the
|
||||
# classifier was created by
|
||||
# NaiveBayesClassifier.train().
|
||||
logprob[label] += sum_logs([]) # = -INF.
|
||||
|
||||
return DictionaryProbDist(logprob, normalize=True, log=True)
|
||||
|
||||
def show_most_informative_features(self, n=10):
|
||||
# Determine the most relevant features, and display them.
|
||||
cpdist = self._feature_probdist
|
||||
print("Most Informative Features")
|
||||
|
||||
for fname, fval in self.most_informative_features(n):
|
||||
|
||||
def labelprob(l):
|
||||
return cpdist[l, fname].prob(fval)
|
||||
|
||||
labels = sorted(
|
||||
(l for l in self._labels if fval in cpdist[l, fname].samples()),
|
||||
key=lambda element: (-labelprob(element), element),
|
||||
reverse=True,
|
||||
)
|
||||
if len(labels) == 1:
|
||||
continue
|
||||
l0 = labels[0]
|
||||
l1 = labels[-1]
|
||||
if cpdist[l0, fname].prob(fval) == 0:
|
||||
ratio = "INF"
|
||||
else:
|
||||
ratio = "%8.1f" % (
|
||||
cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
|
||||
)
|
||||
print(
|
||||
"%24s = %-14r %6s : %-6s = %s : 1.0"
|
||||
% (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
|
||||
)
|
||||
|
||||
def most_informative_features(self, n=100):
|
||||
"""
|
||||
Return a list of the 'most informative' features used by this
|
||||
classifier. For the purpose of this function, the
|
||||
informativeness of a feature ``(fname,fval)`` is equal to the
|
||||
highest value of P(fname=fval|label), for any label, divided by
|
||||
the lowest value of P(fname=fval|label), for any label:
|
||||
|
||||
| max[ P(fname=fval|label1) / P(fname=fval|label2) ]
|
||||
"""
|
||||
if hasattr(self, "_most_informative_features"):
|
||||
return self._most_informative_features[:n]
|
||||
else:
|
||||
# The set of (fname, fval) pairs used by this classifier.
|
||||
features = set()
|
||||
# The max & min probability associated w/ each (fname, fval)
|
||||
# pair. Maps (fname,fval) -> float.
|
||||
maxprob = defaultdict(float)
|
||||
minprob = defaultdict(lambda: 1.0)
|
||||
|
||||
for (label, fname), probdist in self._feature_probdist.items():
|
||||
for fval in probdist.samples():
|
||||
feature = (fname, fval)
|
||||
features.add(feature)
|
||||
p = probdist.prob(fval)
|
||||
maxprob[feature] = max(p, maxprob[feature])
|
||||
minprob[feature] = min(p, minprob[feature])
|
||||
if minprob[feature] == 0:
|
||||
features.discard(feature)
|
||||
|
||||
# Convert features to a list, & sort it by how informative
|
||||
# features are.
|
||||
self._most_informative_features = sorted(
|
||||
features,
|
||||
key=lambda feature_: (
|
||||
minprob[feature_] / maxprob[feature_],
|
||||
feature_[0],
|
||||
feature_[1] in [None, False, True],
|
||||
str(feature_[1]).lower(),
|
||||
),
|
||||
)
|
||||
return self._most_informative_features[:n]
|
||||
|
||||
@classmethod
|
||||
def train(cls, labeled_featuresets, estimator=ELEProbDist):
|
||||
"""
|
||||
:param labeled_featuresets: A list of classified featuresets,
|
||||
i.e., a list of tuples ``(featureset, label)``.
|
||||
"""
|
||||
label_freqdist = FreqDist()
|
||||
feature_freqdist = defaultdict(FreqDist)
|
||||
feature_values = defaultdict(set)
|
||||
fnames = set()
|
||||
|
||||
# Count up how many times each feature value occurred, given
|
||||
# the label and featurename.
|
||||
for featureset, label in labeled_featuresets:
|
||||
label_freqdist[label] += 1
|
||||
for fname, fval in featureset.items():
|
||||
# Increment freq(fval|label, fname)
|
||||
feature_freqdist[label, fname][fval] += 1
|
||||
# Record that fname can take the value fval.
|
||||
feature_values[fname].add(fval)
|
||||
# Keep a list of all feature names.
|
||||
fnames.add(fname)
|
||||
|
||||
# If a feature didn't have a value given for an instance, then
|
||||
# we assume that it gets the implicit value 'None.' This loop
|
||||
# counts up the number of 'missing' feature values for each
|
||||
# (label,fname) pair, and increments the count of the fval
|
||||
# 'None' by that amount.
|
||||
for label in label_freqdist:
|
||||
num_samples = label_freqdist[label]
|
||||
for fname in fnames:
|
||||
count = feature_freqdist[label, fname].N()
|
||||
# Only add a None key when necessary, i.e. if there are
|
||||
# any samples with feature 'fname' missing.
|
||||
if num_samples - count > 0:
|
||||
feature_freqdist[label, fname][None] += num_samples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
# Create the P(label) distribution
|
||||
label_probdist = estimator(label_freqdist)
|
||||
|
||||
# Create the P(fval|label, fname) distribution
|
||||
feature_probdist = {}
|
||||
for (label, fname), freqdist in feature_freqdist.items():
|
||||
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
feature_probdist[label, fname] = probdist
|
||||
|
||||
return cls(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import names_demo
|
||||
|
||||
classifier = names_demo(NaiveBayesClassifier.train)
|
||||
classifier.show_most_informative_features()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
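The normalization described in the module docstring above can be seen directly through ``prob_classify``. The toy last-letter training set is made up for illustration:

    >>> from nltk.classify.naivebayes import NaiveBayesClassifier
    >>> train_set = [
    ...     ({'last-letter': 'a'}, 'female'),
    ...     ({'last-letter': 'a'}, 'female'),
    ...     ({'last-letter': 'k'}, 'male'),
    ...     ({'last-letter': 'o'}, 'male'),
    ... ]
    >>> nb = NaiveBayesClassifier.train(train_set)
    >>> dist = nb.prob_classify({'last-letter': 'a'})
    >>> dist.max()
    'female'
    >>> round(dist.prob('female'), 3)
    0.833
    >>> round(dist.prob('female') + dist.prob('male'), 5)
    1.0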
180
backend/venv/Lib/site-packages/nltk/classify/positivenaivebayes.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# Natural Language Toolkit: Positive Naive Bayes Classifier
|
||||
#
|
||||
# Copyright (C) 2012 NLTK Project
|
||||
# Author: Alessandro Presta <alessandro.presta@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A variant of the Naive Bayes Classifier that performs binary classification with
|
||||
partially-labeled training sets. In other words, assume we want to build a classifier
|
||||
that assigns each example to one of two complementary classes (e.g., male names and
|
||||
female names).
|
||||
If we have a training set with labeled examples for both classes, we can use a
|
||||
standard Naive Bayes Classifier. However, consider the case when we only have labeled
|
||||
examples for one of the classes, and other, unlabeled, examples.
|
||||
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
|
||||
to estimate the frequencies of the various features.
|
||||
|
||||
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
|
||||
and unlabeled examples. We are also given an estimate of P(1).
|
||||
|
||||
We compute P(feature|1) exactly as in the standard case.
|
||||
|
||||
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
|
||||
assuming that the unlabeled examples are drawn according to the given prior distribution)
|
||||
and then express the conditional probability as:
|
||||
|
||||
| P(feature) - P(feature|1) * P(1)
|
||||
| P(feature|0) = ----------------------------------
|
||||
| P(0)
|
||||
|
||||
Example:
|
||||
|
||||
>>> from nltk.classify import PositiveNaiveBayesClassifier
|
||||
|
||||
Some sentences about sports:
|
||||
|
||||
>>> sports_sentences = [ 'The team dominated the game',
|
||||
... 'They lost the ball',
|
||||
... 'The game was intense',
|
||||
... 'The goalkeeper catched the ball',
|
||||
... 'The other team controlled the ball' ]
|
||||
|
||||
Mixed topics, including sports:
|
||||
|
||||
>>> various_sentences = [ 'The President did not comment',
|
||||
... 'I lost the keys',
|
||||
... 'The team won the game',
|
||||
... 'Sara has two kids',
|
||||
... 'The ball went off the court',
|
||||
... 'They had the ball for the whole game',
|
||||
... 'The show is over' ]
|
||||
|
||||
The features of a sentence are simply the words it contains:
|
||||
|
||||
>>> def features(sentence):
|
||||
... words = sentence.lower().split()
|
||||
... return dict(('contains(%s)' % w, True) for w in words)
|
||||
|
||||
We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
|
||||
|
||||
>>> positive_featuresets = map(features, sports_sentences)
|
||||
>>> unlabeled_featuresets = map(features, various_sentences)
|
||||
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
|
||||
... unlabeled_featuresets)
|
||||
|
||||
Is the following sentence about sports?
|
||||
|
||||
>>> classifier.classify(features('The cat is on the table'))
|
||||
False
|
||||
|
||||
What about this one?
|
||||
|
||||
>>> classifier.classify(features('My team lost the game'))
|
||||
True
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Positive Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
|
||||
@staticmethod
|
||||
def train(
|
||||
positive_featuresets,
|
||||
unlabeled_featuresets,
|
||||
positive_prob_prior=0.5,
|
||||
estimator=ELEProbDist,
|
||||
):
|
||||
"""
|
||||
:param positive_featuresets: An iterable of featuresets that are known as positive
|
||||
examples (i.e., their label is ``True``).
|
||||
|
||||
:param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
|
||||
|
||||
:param positive_prob_prior: A prior estimate of the probability of the label
|
||||
``True`` (default 0.5).
|
||||
"""
|
||||
positive_feature_freqdist = defaultdict(FreqDist)
|
||||
unlabeled_feature_freqdist = defaultdict(FreqDist)
|
||||
feature_values = defaultdict(set)
|
||||
fnames = set()
|
||||
|
||||
# Count up how many times each feature value occurred in positive examples.
|
||||
num_positive_examples = 0
|
||||
for featureset in positive_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
positive_feature_freqdist[fname][fval] += 1
|
||||
feature_values[fname].add(fval)
|
||||
fnames.add(fname)
|
||||
num_positive_examples += 1
|
||||
|
||||
# Count up how many times each feature value occurred in unlabeled examples.
|
||||
num_unlabeled_examples = 0
|
||||
for featureset in unlabeled_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
unlabeled_feature_freqdist[fname][fval] += 1
|
||||
feature_values[fname].add(fval)
|
||||
fnames.add(fname)
|
||||
num_unlabeled_examples += 1
|
||||
|
||||
# If a feature didn't have a value given for an instance, then we assume that
|
||||
# it gets the implicit value 'None'.
|
||||
for fname in fnames:
|
||||
count = positive_feature_freqdist[fname].N()
|
||||
positive_feature_freqdist[fname][None] += num_positive_examples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
for fname in fnames:
|
||||
count = unlabeled_feature_freqdist[fname].N()
|
||||
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
negative_prob_prior = 1.0 - positive_prob_prior
|
||||
|
||||
# Create the P(label) distribution.
|
||||
label_probdist = DictionaryProbDist(
|
||||
{True: positive_prob_prior, False: negative_prob_prior}
|
||||
)
|
||||
|
||||
# Create the P(fval|label, fname) distribution.
|
||||
feature_probdist = {}
|
||||
for fname, freqdist in positive_feature_freqdist.items():
|
||||
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
feature_probdist[True, fname] = probdist
|
||||
|
||||
for fname, freqdist in unlabeled_feature_freqdist.items():
|
||||
global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
negative_feature_probs = {}
|
||||
for fval in feature_values[fname]:
|
||||
prob = (
|
||||
global_probdist.prob(fval)
|
||||
- positive_prob_prior * feature_probdist[True, fname].prob(fval)
|
||||
) / negative_prob_prior
|
||||
# TODO: We need to add some kind of smoothing here, instead of
|
||||
# setting negative probabilities to zero and normalizing.
|
||||
negative_feature_probs[fval] = max(prob, 0.0)
|
||||
feature_probdist[False, fname] = DictionaryProbDist(
|
||||
negative_feature_probs, normalize=True
|
||||
)
|
||||
|
||||
return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import partial_names_demo
|
||||
|
||||
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
|
||||
classifier.show_most_informative_features()
|
||||
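The P(feature|0) formula in the docstring above is easy to sanity-check numerically. The probabilities below are invented for illustration (P(feature) estimated from the unlabeled set, P(feature|1) from the positive set, prior P(1) = 0.5):

    >>> p_feature, p_feature_pos, p_pos = 0.4, 0.6, 0.5
    >>> round((p_feature - p_feature_pos * p_pos) / (1 - p_pos), 3)
    0.2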
183
backend/venv/Lib/site-packages/nltk/classify/rte_classify.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: RTE Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Simple classifier for RTE corpus.
|
||||
|
||||
It calculates the overlap in words and named entities between text and
|
||||
hypothesis, and also whether there are words / named entities in the
|
||||
hypothesis which fail to occur in the text, since this is an indicator that
|
||||
the hypothesis is more informative than (i.e. not entailed by) the text.
|
||||
|
||||
TO DO: better Named Entity classification
|
||||
TO DO: add lemmatization
|
||||
"""
|
||||
|
||||
from nltk.classify.maxent import MaxentClassifier
|
||||
from nltk.classify.util import accuracy
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
|
||||
class RTEFeatureExtractor:
|
||||
"""
|
||||
This builds a bag of words for both the text and the hypothesis after
|
||||
throwing away some stopwords, then calculates overlap and difference.
|
||||
"""
|
||||
|
||||
def __init__(self, rtepair, stop=True, use_lemmatize=False):
|
||||
"""
|
||||
:param rtepair: a ``RTEPair`` from which features should be extracted
|
||||
:param stop: if ``True``, stopwords are thrown away.
|
||||
:type stop: bool
|
||||
"""
|
||||
self.stop = stop
|
||||
self.stopwords = {
|
||||
"a",
|
||||
"the",
|
||||
"it",
|
||||
"they",
|
||||
"of",
|
||||
"in",
|
||||
"to",
|
||||
"is",
|
||||
"have",
|
||||
"are",
|
||||
"were",
|
||||
"and",
|
||||
"very",
|
||||
".",
|
||||
",",
|
||||
}
|
||||
|
||||
self.negwords = {"no", "not", "never", "failed", "rejected", "denied"}
|
||||
# Try to tokenize so that abbreviations, monetary amounts, email
|
||||
# addresses, URLs are single tokens.
|
||||
tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
|
||||
|
||||
# Get the set of word types for text and hypothesis
|
||||
self.text_tokens = tokenizer.tokenize(rtepair.text)
|
||||
self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
|
||||
self.text_words = set(self.text_tokens)
|
||||
self.hyp_words = set(self.hyp_tokens)
|
||||
|
||||
if use_lemmatize:
|
||||
self.text_words = {self._lemmatize(token) for token in self.text_tokens}
|
||||
self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens}
|
||||
|
||||
if self.stop:
|
||||
self.text_words = self.text_words - self.stopwords
|
||||
self.hyp_words = self.hyp_words - self.stopwords
|
||||
|
||||
self._overlap = self.hyp_words & self.text_words
|
||||
self._hyp_extra = self.hyp_words - self.text_words
|
||||
self._txt_extra = self.text_words - self.hyp_words
|
||||
|
||||
def overlap(self, toktype, debug=False):
|
||||
"""
|
||||
Compute the overlap between text and hypothesis.
|
||||
|
||||
:param toktype: distinguish Named Entities from ordinary words
|
||||
:type toktype: 'ne' or 'word'
|
||||
"""
|
||||
ne_overlap = {token for token in self._overlap if self._ne(token)}
|
||||
if toktype == "ne":
|
||||
if debug:
|
||||
print("ne overlap", ne_overlap)
|
||||
return ne_overlap
|
||||
elif toktype == "word":
|
||||
if debug:
|
||||
print("word overlap", self._overlap - ne_overlap)
|
||||
return self._overlap - ne_overlap
|
||||
else:
|
||||
raise ValueError("Type not recognized:'%s'" % toktype)
|
||||
|
||||
def hyp_extra(self, toktype, debug=True):
|
||||
"""
|
||||
Compute the extraneous material in the hypothesis.
|
||||
|
||||
:param toktype: distinguish Named Entities from ordinary words
|
||||
:type toktype: 'ne' or 'word'
|
||||
"""
|
||||
ne_extra = {token for token in self._hyp_extra if self._ne(token)}
|
||||
if toktype == "ne":
|
||||
return ne_extra
|
||||
elif toktype == "word":
|
||||
return self._hyp_extra - ne_extra
|
||||
else:
|
||||
raise ValueError("Type not recognized: '%s'" % toktype)
|
||||
|
||||
@staticmethod
|
||||
def _ne(token):
|
||||
"""
|
||||
This just assumes that words in all caps or titles are
|
||||
named entities.
|
||||
|
||||
:type token: str
|
||||
"""
|
||||
if token.istitle() or token.isupper():
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _lemmatize(word):
|
||||
"""
|
||||
Use morphy from WordNet to find the base form of verbs.
|
||||
"""
|
||||
from nltk.corpus import wordnet as wn
|
||||
|
||||
lemma = wn.morphy(word, pos=wn.VERB)
|
||||
if lemma is not None:
|
||||
return lemma
|
||||
return word
|
||||
|
||||
|
||||
def rte_features(rtepair):
|
||||
extractor = RTEFeatureExtractor(rtepair)
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["word_overlap"] = len(extractor.overlap("word"))
|
||||
features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
|
||||
features["ne_overlap"] = len(extractor.overlap("ne"))
|
||||
features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
|
||||
features["neg_txt"] = len(extractor.negwords & extractor.text_words)
|
||||
features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
|
||||
return features
|
||||
|
||||
|
||||
def rte_featurize(rte_pairs):
|
||||
return [(rte_features(pair), pair.value) for pair in rte_pairs]
|
||||
|
||||
|
||||
def rte_classifier(algorithm, sample_N=None):
|
||||
from nltk.corpus import rte as rte_corpus
|
||||
|
||||
train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
|
||||
test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
|
||||
|
||||
if sample_N is not None:
|
||||
train_set = train_set[:sample_N]
|
||||
test_set = test_set[:sample_N]
|
||||
|
||||
featurized_train_set = rte_featurize(train_set)
|
||||
featurized_test_set = rte_featurize(test_set)
|
||||
|
||||
# Train the classifier
|
||||
print("Training classifier...")
|
||||
if algorithm in ["megam"]: # MEGAM based algorithms.
|
||||
clf = MaxentClassifier.train(featurized_train_set, algorithm)
|
||||
elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm
|
||||
clf = MaxentClassifier.train(featurized_train_set, algorithm)
|
||||
else:
|
||||
err_msg = str(
|
||||
"RTEClassifier only supports these algorithms:\n "
|
||||
"'megam', 'GIS', 'IIS'.\n"
|
||||
)
|
||||
raise Exception(err_msg)
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(clf, featurized_test_set)
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
return clf
|
||||
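``RTEFeatureExtractor`` only needs an object with ``text`` and ``hyp`` attributes, so the overlap and hypothesis-extra sets can be sketched on a made-up pair (a namedtuple standing in for a corpus ``RTEPair``):

    >>> from collections import namedtuple
    >>> from nltk.classify.rte_classify import RTEFeatureExtractor, rte_features
    >>> FakePair = namedtuple('FakePair', ['text', 'hyp'])
    >>> pair = FakePair(text='John gave Mary a book', hyp='Mary received a book from John')
    >>> extractor = RTEFeatureExtractor(pair)
    >>> sorted(extractor.overlap('ne'))
    ['John', 'Mary']
    >>> sorted(extractor.hyp_extra('word'))
    ['from', 'received']
    >>> rte_features(pair)['word_overlap']
    1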
143
backend/venv/Lib/site-packages/nltk/classify/scikitlearn.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# Natural Language Toolkit: Interface to scikit-learn classifiers
|
||||
#
|
||||
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
scikit-learn (https://scikit-learn.org) is a machine learning library for
|
||||
Python. It supports many classification algorithms, including SVMs,
|
||||
Naive Bayes, logistic regression (MaxEnt) and decision trees.
|
||||
|
||||
This package implements a wrapper around scikit-learn classifiers. To use this
|
||||
wrapper, construct a scikit-learn estimator object, then use that to construct
|
||||
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
|
||||
|
||||
>>> from sklearn.svm import LinearSVC
|
||||
>>> from nltk.classify.scikitlearn import SklearnClassifier
|
||||
>>> classif = SklearnClassifier(LinearSVC())
|
||||
|
||||
A scikit-learn classifier may include preprocessing steps when it's wrapped
|
||||
in a Pipeline object. The following constructs and wraps a Naive Bayes text
|
||||
classifier with tf-idf weighting and chi-square feature selection to get the
|
||||
best 1000 features:
|
||||
|
||||
>>> from sklearn.feature_extraction.text import TfidfTransformer
|
||||
>>> from sklearn.feature_selection import SelectKBest, chi2
|
||||
>>> from sklearn.naive_bayes import MultinomialNB
|
||||
>>> from sklearn.pipeline import Pipeline
|
||||
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
|
||||
... ('chi2', SelectKBest(chi2, k=1000)),
|
||||
... ('nb', MultinomialNB())])
|
||||
>>> classif = SklearnClassifier(pipeline)
|
||||
"""
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
try:
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
__all__ = ["SklearnClassifier"]
|
||||
|
||||
|
||||
class SklearnClassifier(ClassifierI):
|
||||
"""Wrapper for scikit-learn classifiers."""
|
||||
|
||||
def __init__(self, estimator, dtype=float, sparse=True):
|
||||
"""
|
||||
:param estimator: scikit-learn classifier object.
|
||||
|
||||
:param dtype: data type used when building feature array.
|
||||
scikit-learn estimators work exclusively on numeric data. The
|
||||
default value should be fine for almost all situations.
|
||||
|
||||
:param sparse: Whether to use sparse matrices internally.
|
||||
The estimator must support these; not all scikit-learn classifiers
|
||||
do (see their respective documentation and look for "sparse
|
||||
matrix"). The default value is True, since most NLP problems
|
||||
involve sparse feature sets. Setting this to False may require a
great deal of memory.
|
||||
:type sparse: boolean.
|
||||
"""
|
||||
self._clf = estimator
|
||||
self._encoder = LabelEncoder()
|
||||
self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
|
||||
|
||||
def __repr__(self):
|
||||
return "<SklearnClassifier(%r)>" % self._clf
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
"""Classify a batch of samples.
|
||||
|
||||
:param featuresets: An iterable over featuresets, each a dict mapping
|
||||
strings to either numbers, booleans or strings.
|
||||
:return: The predicted class label for each input sample.
|
||||
:rtype: list
|
||||
"""
|
||||
X = self._vectorizer.transform(featuresets)
|
||||
classes = self._encoder.classes_
|
||||
return [classes[i] for i in self._clf.predict(X)]
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
"""Compute per-class probabilities for a batch of samples.
|
||||
|
||||
:param featuresets: An iterable over featuresets, each a dict mapping
|
||||
strings to either numbers, booleans or strings.
|
||||
:rtype: list of ``ProbDistI``
|
||||
"""
|
||||
X = self._vectorizer.transform(featuresets)
|
||||
y_proba_list = self._clf.predict_proba(X)
|
||||
return [self._make_probdist(y_proba) for y_proba in y_proba_list]
|
||||
|
||||
def labels(self):
|
||||
"""The class labels used by this classifier.
|
||||
|
||||
:rtype: list
|
||||
"""
|
||||
return list(self._encoder.classes_)
|
||||
|
||||
def train(self, labeled_featuresets):
|
||||
"""
|
||||
Train (fit) the scikit-learn estimator.
|
||||
|
||||
:param labeled_featuresets: A list of ``(featureset, label)``
|
||||
where each ``featureset`` is a dict mapping strings to either
|
||||
numbers, booleans or strings.
|
||||
"""
|
||||
|
||||
X, y = list(zip(*labeled_featuresets))
|
||||
X = self._vectorizer.fit_transform(X)
|
||||
y = self._encoder.fit_transform(y)
|
||||
self._clf.fit(X, y)
|
||||
|
||||
return self
|
||||
|
||||
def _make_probdist(self, y_proba):
|
||||
classes = self._encoder.classes_
|
||||
return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
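# Hedged illustration (added; not part of the original NLTK source): a tiny
# end-to-end round trip with dict featuresets, assuming scikit-learn is
# installed so that SklearnClassifier can be constructed.
def _sklearn_classifier_example():
    from sklearn.naive_bayes import BernoulliNB

    train = [
        ({"a": True, "b": False}, "x"),
        ({"a": False, "b": True}, "y"),
    ]
    clf = SklearnClassifier(BernoulliNB()).train(train)
    # classify_many() vectorizes the featuresets and decodes the labels.
    return clf.classify_many([{"a": True, "b": False}])  # expected: ["x"]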
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
|
||||
from nltk.classify.util import names_demo, names_demo_features
|
||||
|
||||
# Bernoulli Naive Bayes is designed for binary classification. We set the
|
||||
# binarize option to False since we know we're passing boolean features.
|
||||
print("scikit-learn Naive Bayes:")
|
||||
names_demo(
|
||||
SklearnClassifier(BernoulliNB(binarize=False)).train,
|
||||
features=names_demo_features,
|
||||
)
|
||||
|
||||
# The C parameter on logistic regression (MaxEnt) controls regularization.
|
||||
# The higher it's set, the less regularized the classifier is.
|
||||
print("\n\nscikit-learn logistic regression:")
|
||||
names_demo(
|
||||
SklearnClassifier(LogisticRegression(C=1000)).train,
|
||||
features=names_demo_features,
|
||||
)
|
||||
175
backend/venv/Lib/site-packages/nltk/classify/senna.py
Normal file
@@ -0,0 +1,175 @@
|
||||
# Natural Language Toolkit: Senna Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A general interface to the SENNA pipeline that supports any of the
|
||||
operations specified in SUPPORTED_OPERATIONS.
|
||||
|
||||
Applying multiple operations at once has a speed advantage. For example,
Senna automatically determines POS tags while extracting named entities,
so applying both operations costs only the time of extracting the named
entities.
|
||||
|
||||
The SENNA pipeline has a fixed maximum sentence size that it can read,
1024 tokens per sentence by default. If you have longer sentences, consider
changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding your
system-specific binary; otherwise misalignment errors may occur.
|
||||
|
||||
The input is:
|
||||
|
||||
- the path to the directory that contains the SENNA executables. If the path is
  incorrect, Senna falls back to the executable specified in the SENNA environment variable
- the list of operations to be performed
- (optionally) the encoding of the input data (default: utf-8)
|
||||
|
||||
Note: Unit tests for this module can be found in test/unit/test_senna.py
|
||||
|
||||
>>> from nltk.classify import Senna
|
||||
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
|
||||
>>> sent = 'Dusseldorf is an international business center'.split()
|
||||
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
|
||||
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
|
||||
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
|
||||
"""
|
||||
|
||||
from os import environ, path, sep
|
||||
from platform import architecture, system
|
||||
from subprocess import PIPE, Popen
|
||||
|
||||
from nltk.tag.api import TaggerI
|
||||
|
||||
|
||||
class Senna(TaggerI):
|
||||
SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
|
||||
|
||||
def __init__(self, senna_path, operations, encoding="utf-8"):
|
||||
self._encoding = encoding
|
||||
self._path = path.normpath(senna_path) + sep
|
||||
|
||||
# Verifies the existence of the executable on the self._path first
|
||||
# senna_binary_file_1 = self.executable(self._path)
|
||||
exe_file_1 = self.executable(self._path)
|
||||
if not path.isfile(exe_file_1):
|
||||
# Check for the system environment
|
||||
if "SENNA" in environ:
|
||||
# self._path = path.join(environ['SENNA'],'')
|
||||
self._path = path.normpath(environ["SENNA"]) + sep
|
||||
exe_file_2 = self.executable(self._path)
|
||||
if not path.isfile(exe_file_2):
|
||||
raise LookupError(
|
||||
"Senna executable expected at %s or %s but not found"
|
||||
% (exe_file_1, exe_file_2)
|
||||
)
|
||||
|
||||
self.operations = operations
|
||||
|
||||
def executable(self, base_path):
|
||||
"""
|
||||
Determine the system-specific binary that should be used in the pipeline.
If the system is not recognized, the default 'senna' binary is used.
|
||||
"""
|
||||
os_name = system()
|
||||
if os_name == "Linux":
|
||||
bits = architecture()[0]
|
||||
if bits == "64bit":
|
||||
return path.join(base_path, "senna-linux64")
|
||||
return path.join(base_path, "senna-linux32")
|
||||
if os_name == "Windows":
|
||||
return path.join(base_path, "senna-win32.exe")
|
||||
if os_name == "Darwin":
|
||||
return path.join(base_path, "senna-osx")
|
||||
return path.join(base_path, "senna")
|
||||
|
||||
def _map(self):
|
||||
"""
|
||||
Calculate which output column the SENNA pipeline will write each requested
tag into. The column order follows the order of SUPPORTED_OPERATIONS.
|
||||
"""
|
||||
_map = {}
|
||||
i = 1
|
||||
for operation in Senna.SUPPORTED_OPERATIONS:
|
||||
if operation in self.operations:
|
||||
_map[operation] = i
|
||||
i += 1
|
||||
return _map
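# Illustration (added): with operations=["pos", "ner"], _map() returns
# {"pos": 1, "ner": 2}, i.e. the index of the tab-separated column in
# Senna's output that holds each requested tag (the first column, index 0,
# is the token itself).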
|
||||
|
||||
def tag(self, tokens):
|
||||
"""
|
||||
Applies the specified operation(s) on a list of tokens.
|
||||
"""
|
||||
return self.tag_sents([tokens])[0]
|
||||
|
||||
def tag_sents(self, sentences):
|
||||
"""
|
||||
Applies the tag method over a list of sentences. This method will return a
|
||||
list of dictionaries. Every dictionary will contain a word with its
|
||||
calculated annotations/tags.
|
||||
"""
|
||||
encoding = self._encoding
|
||||
|
||||
if not path.isfile(self.executable(self._path)):
|
||||
raise LookupError(
|
||||
"Senna executable expected at %s but not found"
|
||||
% self.executable(self._path)
|
||||
)
|
||||
|
||||
# Build the senna command to run the tagger
|
||||
_senna_cmd = [
|
||||
self.executable(self._path),
|
||||
"-path",
|
||||
self._path,
|
||||
"-usrtokens",
|
||||
"-iobtags",
|
||||
]
|
||||
_senna_cmd.extend(["-" + op for op in self.operations])
|
||||
|
||||
# Serialize the actual sentences to a temporary string
|
||||
_input = "\n".join(" ".join(x) for x in sentences) + "\n"
|
||||
if isinstance(_input, str) and encoding:
|
||||
_input = _input.encode(encoding)
|
||||
|
||||
# Run the tagger and get the output
|
||||
p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
|
||||
(stdout, stderr) = p.communicate(input=_input)
|
||||
senna_output = stdout
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
raise RuntimeError("Senna command failed! Details: %s" % stderr)
|
||||
|
||||
if encoding:
|
||||
senna_output = stdout.decode(encoding)
|
||||
|
||||
# Output the tagged sentences
|
||||
map_ = self._map()
|
||||
tagged_sentences = [[]]
|
||||
sentence_index = 0
|
||||
token_index = 0
|
||||
for tagged_word in senna_output.strip().split("\n"):
|
||||
if not tagged_word:
|
||||
tagged_sentences.append([])
|
||||
sentence_index += 1
|
||||
token_index = 0
|
||||
continue
|
||||
tags = tagged_word.split("\t")
|
||||
result = {}
|
||||
for tag in map_:
|
||||
result[tag] = tags[map_[tag]].strip()
|
||||
try:
|
||||
result["word"] = sentences[sentence_index][token_index]
|
||||
except IndexError as e:
|
||||
raise IndexError(
|
||||
"Misalignment error occurred at sentence number %d. Possible reason"
|
||||
" is that the sentence size exceeded the maximum size. Check the "
|
||||
"documentation of Senna class for more information."
|
||||
% sentence_index
|
||||
) from e
|
||||
tagged_sentences[-1].append(result)
|
||||
token_index += 1
|
||||
return tagged_sentences
|
||||
17
backend/venv/Lib/site-packages/nltk/classify/svm.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# Natural Language Toolkit: SVM-based classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
nltk.classify.svm was deprecated. For classification based
on support vector machines (SVMs), use nltk.classify.scikitlearn
(or `scikit-learn <https://scikit-learn.org>`_ directly).
|
||||
"""
|
||||
|
||||
|
||||
class SvmClassifier:
|
||||
def __init__(self, *args, **kwargs):
|
||||
raise NotImplementedError(__doc__)
|
||||
122
backend/venv/Lib/site-packages/nltk/classify/tadm.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# Natural Language Toolkit: Interface to TADM Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
_tadm_bin = None
|
||||
|
||||
|
||||
def config_tadm(bin=None):
|
||||
global _tadm_bin
|
||||
_tadm_bin = find_binary(
|
||||
"tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
|
||||
)
|
||||
|
||||
|
||||
def write_tadm_file(train_toks, encoding, stream):
|
||||
"""
|
||||
Generate an input file for ``tadm`` based on the given corpus of
|
||||
classified tokens.
|
||||
|
||||
:type train_toks: list(tuple(dict, str))
|
||||
:param train_toks: Training data, represented as a list of
|
||||
pairs, the first member of which is a feature dictionary,
|
||||
and the second of which is a classification label.
|
||||
:type encoding: TadmEventMaxentFeatureEncoding
|
||||
:param encoding: A feature encoding, used to convert featuresets
|
||||
into feature vectors.
|
||||
:type stream: stream
|
||||
:param stream: The stream to which the ``tadm`` input file should be
|
||||
written.
|
||||
"""
|
||||
# See the following for a file format description:
|
||||
#
|
||||
# https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
|
||||
# https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
|
||||
labels = encoding.labels()
|
||||
for featureset, label in train_toks:
|
||||
length_line = "%d\n" % len(labels)
|
||||
stream.write(length_line)
|
||||
for known_label in labels:
|
||||
v = encoding.encode(featureset, known_label)
|
||||
line = "%d %d %s\n" % (
|
||||
int(label == known_label),
|
||||
len(v),
|
||||
" ".join("%d %d" % u for u in v),
|
||||
)
|
||||
stream.write(line)
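# Illustration (added; values are made up): for a token whose encoding has two
# candidate labels, the loop above emits one block such as
#
#   2
#   1 2 0 1 2 1
#   0 2 5 1 6 2
#
# i.e. the number of labels, then one line per label holding the indicator
# (1 for the true label), the number of encoded (feature_id, value) pairs,
# and the pairs themselves flattened onto the same line.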
|
||||
|
||||
|
||||
def parse_tadm_weights(paramfile):
|
||||
"""
|
||||
Given the stdout output generated by ``tadm`` when training a
|
||||
model, return a ``numpy`` array containing the corresponding weight
|
||||
vector.
|
||||
"""
|
||||
weights = []
|
||||
for line in paramfile:
|
||||
weights.append(float(line.strip()))
|
||||
return numpy.array(weights, "d")
|
||||
|
||||
|
||||
def call_tadm(args):
|
||||
"""
|
||||
Call the ``tadm`` binary with the given arguments.
|
||||
"""
|
||||
if isinstance(args, str):
|
||||
raise TypeError("args should be a list of strings")
|
||||
if _tadm_bin is None:
|
||||
config_tadm()
|
||||
|
||||
# Call tadm via a subprocess
|
||||
cmd = [_tadm_bin] + args
|
||||
p = subprocess.Popen(cmd, stdout=sys.stdout)
|
||||
(stdout, stderr) = p.communicate()
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
print()
|
||||
print(stderr)
|
||||
raise OSError("tadm command failed!")
|
||||
|
||||
|
||||
def names_demo():
|
||||
from nltk.classify.maxent import TadmMaxentClassifier
|
||||
from nltk.classify.util import names_demo
|
||||
|
||||
classifier = names_demo(TadmMaxentClassifier.train)
|
||||
|
||||
|
||||
def encoding_demo():
|
||||
import sys
|
||||
|
||||
from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
|
||||
|
||||
tokens = [
|
||||
({"f0": 1, "f1": 1, "f3": 1}, "A"),
|
||||
({"f0": 1, "f2": 1, "f4": 1}, "B"),
|
||||
({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
|
||||
]
|
||||
encoding = TadmEventMaxentFeatureEncoding.train(tokens)
|
||||
write_tadm_file(tokens, encoding, sys.stdout)
|
||||
print()
|
||||
for i in range(encoding.length()):
|
||||
print("%s --> %d" % (encoding.describe(i), i))
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
encoding_demo()
|
||||
names_demo()
|
||||
193
backend/venv/Lib/site-packages/nltk/classify/textcat.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Language ID module using TextCat algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A module for language identification using the TextCat algorithm.
|
||||
An implementation of the text categorization algorithm
|
||||
presented in Cavnar, W. B. and J. M. Trenkle,
|
||||
"N-Gram-Based Text Categorization".
|
||||
|
||||
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the languages and the text yet to
be identified, then compares the profiles using a distance measure.
|
||||
|
||||
Language n-grams are provided by the "An Crubadan"
|
||||
project. A corpus reader was created separately to read
|
||||
those files.
|
||||
|
||||
For details regarding the algorithm, see:
|
||||
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
|
||||
|
||||
For details about An Crubadan, see:
|
||||
https://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from sys import maxsize
|
||||
|
||||
from nltk.util import trigrams
|
||||
|
||||
# Note: this is NOT the "re" module you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
|
||||
try:
|
||||
import regex as re
|
||||
except ImportError:
|
||||
re = None
|
||||
######################################################################
|
||||
## Language identification using TextCat
|
||||
######################################################################
|
||||
|
||||
|
||||
class TextCat:
|
||||
_corpus = None
|
||||
fingerprints = {}
|
||||
_START_CHAR = "<"
|
||||
_END_CHAR = ">"
|
||||
|
||||
last_distances = {}
|
||||
|
||||
def __init__(self):
|
||||
if not re:
|
||||
raise OSError(
|
||||
"classify.textcat requires the regex module that "
|
||||
"supports unicode. Try '$ pip install regex' and "
|
||||
"see https://pypi.python.org/pypi/regex for "
|
||||
"further details."
|
||||
)
|
||||
|
||||
from nltk.corpus import crubadan
|
||||
|
||||
self._corpus = crubadan
|
||||
# Load all language ngrams into cache
|
||||
for lang in self._corpus.langs():
|
||||
self._corpus.lang_freq(lang)
|
||||
|
||||
def remove_punctuation(self, text):
|
||||
"""Get rid of punctuation except apostrophes"""
|
||||
return re.sub(r"[^\P{P}\']+", "", text)
|
||||
|
||||
def profile(self, text):
|
||||
"""Create FreqDist of trigrams within text"""
|
||||
from nltk import FreqDist, word_tokenize
|
||||
|
||||
clean_text = self.remove_punctuation(text)
|
||||
tokens = word_tokenize(clean_text)
|
||||
|
||||
fingerprint = FreqDist()
|
||||
for t in tokens:
|
||||
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
|
||||
token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
|
||||
|
||||
for cur_trigram in token_trigrams:
|
||||
if cur_trigram in fingerprint:
|
||||
fingerprint[cur_trigram] += 1
|
||||
else:
|
||||
fingerprint[cur_trigram] = 1
|
||||
|
||||
return fingerprint
|
||||
|
||||
def calc_dist(self, lang, trigram, text_profile):
|
||||
"""Calculate the "out-of-place" measure between the
|
||||
text and language profile for a single trigram"""
|
||||
|
||||
lang_fd = self._corpus.lang_freq(lang)
|
||||
dist = 0
|
||||
|
||||
if trigram in lang_fd:
|
||||
idx_lang_profile = list(lang_fd.keys()).index(trigram)
|
||||
idx_text = list(text_profile.keys()).index(trigram)
|
||||
|
||||
# print(idx_lang_profile, ", ", idx_text)
|
||||
dist = abs(idx_lang_profile - idx_text)
|
||||
else:
|
||||
# Arbitrary but should be larger than
|
||||
# any possible trigram file length
|
||||
# in terms of total lines
|
||||
dist = maxsize
|
||||
|
||||
return dist
|
||||
|
||||
def lang_dists(self, text):
|
||||
"""Calculate the "out-of-place" measure between
|
||||
the text and all languages"""
|
||||
|
||||
distances = {}
|
||||
profile = self.profile(text)
|
||||
# For all the languages
|
||||
for lang in self._corpus._all_lang_freq.keys():
|
||||
# Calculate distance metric for every trigram in
|
||||
# input text to be identified
|
||||
lang_dist = 0
|
||||
for trigram in profile:
|
||||
lang_dist += self.calc_dist(lang, trigram, profile)
|
||||
|
||||
distances[lang] = lang_dist
|
||||
|
||||
return distances
|
||||
|
||||
def guess_language(self, text):
|
||||
"""Find the language with the min distance
|
||||
to the text and return its ISO 639-3 code"""
|
||||
self.last_distances = self.lang_dists(text)
|
||||
|
||||
return min(self.last_distances, key=self.last_distances.get)
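# Hedged usage sketch (added; not part of the original module). It assumes
# the crubadan and punkt data that TextCat relies on have been downloaded.
def _guess_language_example():
    tc = TextCat()
    # Returns an ISO 639-3 code, e.g. "nld" for this Dutch snippet.
    return tc.guess_language("Dit is een korte Nederlandse zin.")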
|
||||
#################################################
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.corpus import udhr
|
||||
|
||||
langs = [
|
||||
"Kurdish-UTF8",
|
||||
"Abkhaz-UTF8",
|
||||
"Farsi_Persian-UTF8",
|
||||
"Hindi-UTF8",
|
||||
"Hawaiian-UTF8",
|
||||
"Russian-UTF8",
|
||||
"Vietnamese-UTF8",
|
||||
"Serbian_Srpski-UTF8",
|
||||
"Esperanto-UTF8",
|
||||
]
|
||||
|
||||
friendly = {
|
||||
"kmr": "Northern Kurdish",
|
||||
"abk": "Abkhazian",
|
||||
"pes": "Iranian Persian",
|
||||
"hin": "Hindi",
|
||||
"haw": "Hawaiian",
|
||||
"rus": "Russian",
|
||||
"vie": "Vietnamese",
|
||||
"srp": "Serbian",
|
||||
"epo": "Esperanto",
|
||||
}
|
||||
|
||||
tc = TextCat()
|
||||
|
||||
for cur_lang in langs:
|
||||
# Get raw data from UDHR corpus
|
||||
raw_sentences = udhr.sents(cur_lang)
|
||||
rows = len(raw_sentences) - 1
|
||||
cols = list(map(len, raw_sentences))
|
||||
|
||||
sample = ""
|
||||
|
||||
# Generate a sample text of the language
|
||||
for i in range(0, rows):
|
||||
cur_sent = " " + " ".join([raw_sentences[i][j] for j in range(0, cols[i])])
|
||||
sample += cur_sent
|
||||
|
||||
# Try to detect what it is
|
||||
print("Language snippet: " + sample[0:140] + "...")
|
||||
guess = tc.guess_language(sample)
|
||||
print(f"Language detection: {guess} ({friendly[guess]})")
|
||||
print("#" * 140)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
347
backend/venv/Lib/site-packages/nltk/classify/util.py
Normal file
@@ -0,0 +1,347 @@
|
||||
# Natural Language Toolkit: Classifier Utility Functions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Utility functions and classes for classifiers.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
# from nltk.util import Deprecated
|
||||
import nltk.classify.util # for accuracy & log_likelihood
|
||||
from nltk.util import LazyMap
|
||||
|
||||
######################################################################
|
||||
# { Helper Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
# alternative name possibility: 'map_featurefunc()'?
|
||||
# alternative name possibility: 'detect_features()'?
|
||||
# alternative name possibility: 'map_featuredetect()'?
|
||||
# or.. just have users use LazyMap directly?
|
||||
def apply_features(feature_func, toks, labeled=None):
|
||||
"""
|
||||
Use the ``LazyMap`` class to construct a lazy list-like
|
||||
object that is analogous to ``map(feature_func, toks)``. In
|
||||
particular, if ``labeled=False``, then the returned list-like
|
||||
object's values are equal to::
|
||||
|
||||
[feature_func(tok) for tok in toks]
|
||||
|
||||
If ``labeled=True``, then the returned list-like object's values
|
||||
are equal to::
|
||||
|
||||
[(feature_func(tok), label) for (tok, label) in toks]
|
||||
|
||||
The primary purpose of this function is to avoid the memory
|
||||
overhead involved in storing all the featuresets for every token
|
||||
in a corpus. Instead, these featuresets are constructed lazily,
|
||||
as-needed. The reduction in memory overhead can be especially
|
||||
significant when the underlying list of tokens is itself lazy (as
|
||||
is the case with many corpus readers).
|
||||
|
||||
:param feature_func: The function that will be applied to each
|
||||
token. It should return a featureset -- i.e., a dict
|
||||
mapping feature names to feature values.
|
||||
:param toks: The list of tokens to which ``feature_func`` should be
|
||||
applied. If ``labeled=True``, then the list elements will be
|
||||
passed directly to ``feature_func()``. If ``labeled=False``,
|
||||
then the list elements should be tuples ``(tok,label)``, and
|
||||
``tok`` will be passed to ``feature_func()``.
|
||||
:param labeled: If true, then ``toks`` contains labeled tokens --
|
||||
i.e., tuples of the form ``(tok, label)``. (Default:
|
||||
auto-detect based on types.)
|
||||
"""
|
||||
if labeled is None:
|
||||
labeled = toks and isinstance(toks[0], (tuple, list))
|
||||
if labeled:
|
||||
|
||||
def lazy_func(labeled_token):
|
||||
return (feature_func(labeled_token[0]), labeled_token[1])
|
||||
|
||||
return LazyMap(lazy_func, toks)
|
||||
else:
|
||||
return LazyMap(feature_func, toks)
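# Hedged illustration (added; not part of the original module): feature
# extraction is deferred until a LazyMap element is actually accessed.
def _apply_features_example():
    toks = [("Alice", "female"), ("Bob", "male")]
    lazy = apply_features(names_demo_features, toks, labeled=True)
    # names_demo_features (defined below) runs here, on first access.
    featureset, label = lazy[0]
    return featureset["startswith"], label  # expected: ("a", "female")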
|
||||
|
||||
|
||||
def attested_labels(tokens):
|
||||
"""
|
||||
:return: A list of all labels that are attested in the given list
|
||||
of tokens.
|
||||
:rtype: list of (immutable)
|
||||
:param tokens: The list of classified tokens from which to extract
|
||||
labels. A classified token has the form ``(token, label)``.
|
||||
:type tokens: list
|
||||
"""
|
||||
return tuple({label for (tok, label) in tokens})
|
||||
|
||||
|
||||
def log_likelihood(classifier, gold):
|
||||
results = classifier.prob_classify_many([fs for (fs, l) in gold])
|
||||
ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
|
||||
return math.log(sum(ll) / len(ll))
|
||||
|
||||
|
||||
def accuracy(classifier, gold):
|
||||
results = classifier.classify_many([fs for (fs, l) in gold])
|
||||
correct = [l == r for ((fs, l), r) in zip(gold, results)]
|
||||
if correct:
|
||||
return sum(correct) / len(correct)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
class CutoffChecker:
|
||||
"""
|
||||
A helper class that implements cutoff checks based on number of
|
||||
iterations and log likelihood.
|
||||
|
||||
Accuracy cutoffs are also implemented, but they're almost never
|
||||
a good idea to use.
|
||||
"""
|
||||
|
||||
def __init__(self, cutoffs):
|
||||
self.cutoffs = cutoffs.copy()
|
||||
if "min_ll" in cutoffs:
|
||||
cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
|
||||
if "min_lldelta" in cutoffs:
|
||||
cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
|
||||
self.ll = None
|
||||
self.acc = None
|
||||
self.iter = 1
|
||||
|
||||
def check(self, classifier, train_toks):
|
||||
cutoffs = self.cutoffs
|
||||
self.iter += 1
|
||||
if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
|
||||
return True # iteration cutoff.
|
||||
|
||||
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
|
||||
if math.isnan(new_ll):
|
||||
return True
|
||||
|
||||
if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
|
||||
if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
|
||||
return True # log likelihood cutoff
|
||||
if (
|
||||
"min_lldelta" in cutoffs
|
||||
and self.ll
|
||||
and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
|
||||
):
|
||||
return True # log likelihood delta cutoff
|
||||
self.ll = new_ll
|
||||
|
||||
if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
|
||||
new_acc = nltk.classify.util.accuracy(classifier, train_toks)
|
||||
if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
|
||||
return True # accuracy cutoff
|
||||
if (
|
||||
"min_accdelta" in cutoffs
|
||||
and self.acc
|
||||
and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
|
||||
):
|
||||
return True # accuracy delta cutoff
|
||||
self.acc = new_acc
|
||||
|
||||
return False # no cutoff reached.
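# Illustration (added): a typical cutoffs dict combines an iteration limit
# with a log-likelihood delta, e.g.
#
#   checker = CutoffChecker({"max_iter": 100, "min_lldelta": 1e-4})
#
# check() is then called once per training iteration and returns True as
# soon as either cutoff is reached.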
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Demos
|
||||
######################################################################
|
||||
|
||||
|
||||
def names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith"] = name[0].lower()
|
||||
features["endswith"] = name[-1].lower()
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
return features
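# Illustration (added): names_demo_features("Gina") yields a dict containing
# {"alwayson": True, "startswith": "g", "endswith": "a", "count(a)": 1,
#  "has(a)": True, ...} with one count()/has() pair per letter of the alphabet.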
|
||||
|
||||
|
||||
def binary_names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith(vowel)"] = name[0].lower() in "aeiouy"
|
||||
features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
features["startswith(%s)" % letter] = letter == name[0].lower()
|
||||
features["endswith(%s)" % letter] = letter == name[-1].lower()
|
||||
return features
|
||||
|
||||
|
||||
def names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
# Construct a list of classified names, using the names corpus.
|
||||
namelist = [(name, "male") for name in names.words("male.txt")] + [
|
||||
(name, "female") for name in names.words("female.txt")
|
||||
]
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
random.seed(123456)
|
||||
random.shuffle(namelist)
|
||||
train = namelist[:5000]
|
||||
test = namelist[5000:5500]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(n), g) for (n, g) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, g) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, gender), pdist in list(zip(test, pdists))[:5]:
|
||||
if gender == "male":
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def partial_names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
male_names = names.words("male.txt")
|
||||
female_names = names.words("female.txt")
|
||||
|
||||
random.seed(654321)
|
||||
random.shuffle(male_names)
|
||||
random.shuffle(female_names)
|
||||
|
||||
# Create a list of male names to be used as positive-labeled examples for training
|
||||
positive = map(features, male_names[:2000])
|
||||
|
||||
# Create a list of male and female names to be used as unlabeled examples
|
||||
unlabeled = map(features, male_names[2000:2500] + female_names[:500])
|
||||
|
||||
# Create a test set with correctly-labeled male and female names
|
||||
test = [(name, True) for name in male_names[2500:2750]] + [
|
||||
(name, False) for name in female_names[500:750]
|
||||
]
|
||||
|
||||
random.shuffle(test)
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer(positive, unlabeled)
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, m) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, is_male), pdist in list(zip(test, pdists))[:5]:
|
||||
if is_male:
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
_inst_cache = {}
|
||||
|
||||
|
||||
def wsd_demo(trainer, word, features, n=1000):
|
||||
import random
|
||||
|
||||
from nltk.corpus import senseval
|
||||
|
||||
# Get the instances.
|
||||
print("Reading data...")
|
||||
global _inst_cache
|
||||
if word not in _inst_cache:
|
||||
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
|
||||
instances = _inst_cache[word][:]
|
||||
if n > len(instances):
|
||||
n = len(instances)
|
||||
senses = list({l for (i, l) in instances})
|
||||
print(" Senses: " + " ".join(senses))
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
print("Splitting into test & train...")
|
||||
random.seed(123456)
|
||||
random.shuffle(instances)
|
||||
train = instances[: int(0.8 * n)]
|
||||
test = instances[int(0.8 * n) : n]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(i), l) for (i, l) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(i) for (i, n) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def check_megam_config():
|
||||
"""
|
||||
Checks whether the MEGAM binary is configured.
|
||||
"""
|
||||
try:
|
||||
_megam_bin
|
||||
except NameError as e:
|
||||
err_msg = str(
|
||||
"Please configure your megam binary first, e.g.\n"
|
||||
">>> nltk.config_megam('/usr/bin/local/megam')"
|
||||
)
|
||||
raise NameError(err_msg) from e
|
||||
377
backend/venv/Lib/site-packages/nltk/classify/weka.py
Normal file
@@ -0,0 +1,377 @@
|
||||
# Natural Language Toolkit: Interface to Weka Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classifiers that make use of the external 'Weka' package.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import zipfile
|
||||
from sys import stdin
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.internals import config_java, java
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
_weka_classpath = None
|
||||
_weka_search = [
|
||||
".",
|
||||
"/usr/share/weka",
|
||||
"/usr/local/share/weka",
|
||||
"/usr/lib/weka",
|
||||
"/usr/local/lib/weka",
|
||||
]
|
||||
|
||||
|
||||
def config_weka(classpath=None):
|
||||
global _weka_classpath
|
||||
|
||||
# Make sure java's configured first.
|
||||
config_java()
|
||||
|
||||
if classpath is not None:
|
||||
_weka_classpath = classpath
|
||||
|
||||
if _weka_classpath is None:
|
||||
searchpath = _weka_search
|
||||
if "WEKAHOME" in os.environ:
|
||||
searchpath.insert(0, os.environ["WEKAHOME"])
|
||||
|
||||
for path in searchpath:
|
||||
if os.path.exists(os.path.join(path, "weka.jar")):
|
||||
_weka_classpath = os.path.join(path, "weka.jar")
|
||||
version = _check_weka_version(_weka_classpath)
|
||||
if version:
|
||||
print(f"[Found Weka: {_weka_classpath} (version {version})]")
|
||||
else:
|
||||
print("[Found Weka: %s]" % _weka_classpath)
|
||||
_check_weka_version(_weka_classpath)
|
||||
|
||||
if _weka_classpath is None:
|
||||
raise LookupError(
|
||||
"Unable to find weka.jar! Use config_weka() "
|
||||
"or set the WEKAHOME environment variable. "
|
||||
"For more information about Weka, please see "
|
||||
"https://www.cs.waikato.ac.nz/ml/weka/"
|
||||
)
|
||||
|
||||
|
||||
def _check_weka_version(jar):
|
||||
try:
|
||||
zf = zipfile.ZipFile(jar)
|
||||
except (SystemExit, KeyboardInterrupt):
|
||||
raise
|
||||
except:
|
||||
return None
|
||||
try:
|
||||
try:
|
||||
return zf.read("weka/core/version.txt")
|
||||
except KeyError:
|
||||
return None
|
||||
finally:
|
||||
zf.close()
|
||||
|
||||
|
||||
class WekaClassifier(ClassifierI):
|
||||
def __init__(self, formatter, model_filename):
|
||||
self._formatter = formatter
|
||||
self._model = model_filename
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
return self._classify_many(featuresets, ["-p", "0", "-distribution"])
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
return self._classify_many(featuresets, ["-p", "0"])
|
||||
|
||||
def _classify_many(self, featuresets, options):
|
||||
# Make sure we can find java & weka.
|
||||
config_weka()
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
# Write the test data file.
|
||||
test_filename = os.path.join(temp_dir, "test.arff")
|
||||
self._formatter.write(test_filename, featuresets)
|
||||
|
||||
# Call weka to classify the data.
|
||||
cmd = [
|
||||
"weka.classifiers.bayes.NaiveBayes",
|
||||
"-l",
|
||||
self._model,
|
||||
"-T",
|
||||
test_filename,
|
||||
] + options
|
||||
(stdout, stderr) = java(
|
||||
cmd,
|
||||
classpath=_weka_classpath,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
|
||||
# Check if something went wrong:
|
||||
if stderr and not stdout:
|
||||
if "Illegal options: -distribution" in stderr:
|
||||
raise ValueError(
|
||||
"The installed version of weka does "
|
||||
"not support probability distribution "
|
||||
"output."
|
||||
)
|
||||
else:
|
||||
raise ValueError("Weka failed to generate output:\n%s" % stderr)
|
||||
|
||||
# Parse weka's output.
|
||||
return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
os.rmdir(temp_dir)
|
||||
|
||||
def parse_weka_distribution(self, s):
|
||||
probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
|
||||
probs = dict(zip(self._formatter.labels(), probs))
|
||||
return DictionaryProbDist(probs)
|
||||
|
||||
def parse_weka_output(self, lines):
|
||||
# Strip unwanted text from stdout
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip().startswith("inst#"):
|
||||
lines = lines[i:]
|
||||
break
|
||||
|
||||
if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
|
||||
return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
|
||||
elif lines[0].split() == [
|
||||
"inst#",
|
||||
"actual",
|
||||
"predicted",
|
||||
"error",
|
||||
"distribution",
|
||||
]:
|
||||
return [
|
||||
self.parse_weka_distribution(line.split()[-1])
|
||||
for line in lines[1:]
|
||||
if line.strip()
|
||||
]
|
||||
|
||||
# is this safe?
|
||||
elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
|
||||
return [line.split()[1] for line in lines if line.strip()]
|
||||
|
||||
else:
|
||||
for line in lines[:10]:
|
||||
print(line)
|
||||
raise ValueError(
|
||||
"Unhandled output format -- your version "
|
||||
"of weka may not be supported.\n"
|
||||
" Header: %s" % lines[0]
|
||||
)
|
||||
|
||||
# [xx] full list of classifiers (some may be abstract?):
|
||||
# ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
|
||||
# DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
|
||||
# JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
|
||||
# LogisticBase, M5Base, MultilayerPerceptron,
|
||||
# MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
|
||||
# NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
|
||||
# PreConstructedLinearModel, Prism, RandomForest,
|
||||
# RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
|
||||
# RuleNode, SimpleLinearRegression, SimpleLogistic,
|
||||
# SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
|
||||
# VotedPerceptron, Winnow, ZeroR
|
||||
|
||||
_CLASSIFIER_CLASS = {
|
||||
"naivebayes": "weka.classifiers.bayes.NaiveBayes",
|
||||
"C4.5": "weka.classifiers.trees.J48",
|
||||
"log_regression": "weka.classifiers.functions.Logistic",
|
||||
"svm": "weka.classifiers.functions.SMO",
|
||||
"kstar": "weka.classifiers.lazy.KStar",
|
||||
"ripper": "weka.classifiers.rules.JRip",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def train(
|
||||
cls,
|
||||
model_filename,
|
||||
featuresets,
|
||||
classifier="naivebayes",
|
||||
options=[],
|
||||
quiet=True,
|
||||
):
|
||||
# Make sure we can find java & weka.
|
||||
config_weka()
|
||||
|
||||
# Build an ARFF formatter.
|
||||
formatter = ARFF_Formatter.from_train(featuresets)
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
# Write the training data file.
|
||||
train_filename = os.path.join(temp_dir, "train.arff")
|
||||
formatter.write(train_filename, featuresets)
|
||||
|
||||
if classifier in cls._CLASSIFIER_CLASS:
|
||||
javaclass = cls._CLASSIFIER_CLASS[classifier]
|
||||
elif classifier in cls._CLASSIFIER_CLASS.values():
|
||||
javaclass = classifier
|
||||
else:
|
||||
raise ValueError("Unknown classifier %s" % classifier)
|
||||
|
||||
# Train the weka model.
|
||||
cmd = [javaclass, "-d", model_filename, "-t", train_filename]
|
||||
cmd += list(options)
|
||||
if quiet:
|
||||
stdout = subprocess.PIPE
|
||||
else:
|
||||
stdout = None
|
||||
java(cmd, classpath=_weka_classpath, stdout=stdout)
|
||||
|
||||
# Return the new classifier.
|
||||
return WekaClassifier(formatter, model_filename)
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
os.rmdir(temp_dir)
|
||||
|
||||
|
||||
class ARFF_Formatter:
|
||||
"""
|
||||
Converts featuresets and labeled featuresets to ARFF-formatted
|
||||
strings, appropriate for input into Weka.
|
||||
|
||||
Features and classes can be specified manually in the constructor, or may
|
||||
be determined from data using ``from_train``.
|
||||
"""
|
||||
|
||||
def __init__(self, labels, features):
|
||||
"""
|
||||
:param labels: A list of all class labels that can be generated.
|
||||
:param features: A list of feature specifications, where
|
||||
each feature specification is a tuple (fname, ftype);
|
||||
and ftype is an ARFF type string such as NUMERIC or
|
||||
STRING.
|
||||
"""
|
||||
self._labels = labels
|
||||
self._features = features
|
||||
|
||||
def format(self, tokens):
|
||||
"""Returns a string representation of ARFF output for the given data."""
|
||||
return self.header_section() + self.data_section(tokens)
|
||||
|
||||
def labels(self):
|
||||
"""Returns the list of classes."""
|
||||
return list(self._labels)
|
||||
|
||||
def write(self, outfile, tokens):
|
||||
"""Writes ARFF data to a file for the given data."""
|
||||
if not hasattr(outfile, "write"):
|
||||
outfile = open(outfile, "w")
|
||||
outfile.write(self.format(tokens))
|
||||
outfile.close()
|
||||
|
||||
@staticmethod
|
||||
def from_train(tokens):
|
||||
"""
|
||||
Constructs an ARFF_Formatter instance with class labels and feature
|
||||
types determined from the given data. Handles boolean, numeric and
|
||||
string (note: not nominal) types.
|
||||
"""
|
||||
# Find the set of all attested labels.
|
||||
labels = {label for (tok, label) in tokens}
|
||||
|
||||
# Determine the types of all features.
|
||||
features = {}
|
||||
for tok, label in tokens:
|
||||
for fname, fval in tok.items():
|
||||
if issubclass(type(fval), bool):
|
||||
ftype = "{True, False}"
|
||||
elif issubclass(type(fval), (int, float, bool)):
|
||||
ftype = "NUMERIC"
|
||||
elif issubclass(type(fval), str):
|
||||
ftype = "STRING"
|
||||
elif fval is None:
|
||||
continue # can't tell the type.
|
||||
else:
|
||||
raise ValueError("Unsupported value type %r" % ftype)
|
||||
|
||||
if features.get(fname, ftype) != ftype:
|
||||
raise ValueError("Inconsistent type for %s" % fname)
|
||||
features[fname] = ftype
|
||||
features = sorted(features.items())
|
||||
|
||||
return ARFF_Formatter(labels, features)
|
||||
|
||||
def header_section(self):
|
||||
"""Returns an ARFF header as a string."""
|
||||
# Header comment.
|
||||
s = (
|
||||
"% Weka ARFF file\n"
|
||||
+ "% Generated automatically by NLTK\n"
|
||||
+ "%% %s\n\n" % time.ctime()
|
||||
)
|
||||
|
||||
# Relation name
|
||||
s += "@RELATION rel\n\n"
|
||||
|
||||
# Input attribute specifications
|
||||
for fname, ftype in self._features:
|
||||
s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
|
||||
|
||||
# Label attribute specification
|
||||
s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
|
||||
|
||||
return s
|
||||
|
||||
def data_section(self, tokens, labeled=None):
|
||||
"""
|
||||
Returns the ARFF data section for the given data.
|
||||
|
||||
:param tokens: a list of featuresets (dicts) or labelled featuresets
|
||||
which are tuples (featureset, label).
|
||||
:param labeled: Indicates whether the given tokens are labeled
|
||||
or not. If None, then the tokens will be assumed to be
|
||||
labeled if the first token's value is a tuple or list.
|
||||
"""
|
||||
# Check if the tokens are labeled or unlabeled. If unlabeled,
|
||||
# then use 'None'
|
||||
if labeled is None:
|
||||
labeled = tokens and isinstance(tokens[0], (tuple, list))
|
||||
if not labeled:
|
||||
tokens = [(tok, None) for tok in tokens]
|
||||
|
||||
# Data section
|
||||
s = "\n@DATA\n"
|
||||
for tok, label in tokens:
|
||||
for fname, ftype in self._features:
|
||||
s += "%s," % self._fmt_arff_val(tok.get(fname))
|
||||
s += "%s\n" % self._fmt_arff_val(label)
|
||||
|
||||
return s
|
||||
|
||||
def _fmt_arff_val(self, fval):
|
||||
if fval is None:
|
||||
return "?"
|
||||
elif isinstance(fval, (bool, int)):
|
||||
return "%s" % fval
|
||||
elif isinstance(fval, float):
|
||||
return "%r" % fval
|
||||
else:
|
||||
return "%r" % fval
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from nltk.classify.util import binary_names_demo_features, names_demo
|
||||
|
||||
def make_classifier(featuresets):
|
||||
return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
|
||||
|
||||
classifier = names_demo(make_classifier, binary_names_demo_features)
|
||||