Initial commit
This commit is contained in:
51
backend/venv/Lib/site-packages/nltk/metrics/__init__.py
Normal file
51
backend/venv/Lib/site-packages/nltk/metrics/__init__.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# Natural Language Toolkit: Metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
NLTK Metrics
|
||||
|
||||
Classes and methods for scoring processing modules.
|
||||
"""
|
||||
|
||||
from nltk.metrics.agreement import AnnotationTask
|
||||
from nltk.metrics.aline import align
|
||||
from nltk.metrics.association import (
|
||||
BigramAssocMeasures,
|
||||
ContingencyMeasures,
|
||||
NgramAssocMeasures,
|
||||
QuadgramAssocMeasures,
|
||||
TrigramAssocMeasures,
|
||||
)
|
||||
from nltk.metrics.confusionmatrix import ConfusionMatrix
|
||||
from nltk.metrics.distance import (
|
||||
binary_distance,
|
||||
custom_distance,
|
||||
edit_distance,
|
||||
edit_distance_align,
|
||||
fractional_presence,
|
||||
interval_distance,
|
||||
jaccard_distance,
|
||||
masi_distance,
|
||||
presence,
|
||||
)
|
||||
from nltk.metrics.paice import Paice
|
||||
from nltk.metrics.scores import (
|
||||
accuracy,
|
||||
approxrand,
|
||||
f_measure,
|
||||
log_likelihood,
|
||||
precision,
|
||||
recall,
|
||||
)
|
||||
from nltk.metrics.segmentation import ghd, pk, windowdiff
|
||||
from nltk.metrics.spearman import (
|
||||
ranks_from_scores,
|
||||
ranks_from_sequence,
|
||||
spearman_correlation,
|
||||
)
|
||||
467
backend/venv/Lib/site-packages/nltk/metrics/agreement.py
Normal file
467
backend/venv/Lib/site-packages/nltk/metrics/agreement.py
Normal file
@@ -0,0 +1,467 @@
|
||||
# Natural Language Toolkit: Agreement Metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tom Lippincott <tom@cs.columbia.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
Implementations of inter-annotator agreement coefficients surveyed by Artstein
|
||||
and Poesio (2007), Inter-Coder Agreement for Computational Linguistics.
|
||||
|
||||
An agreement coefficient calculates the amount that annotators agreed on label
|
||||
assignments beyond what is expected by chance.
|
||||
|
||||
In defining the AnnotationTask class, we use naming conventions similar to the
|
||||
paper's terminology. There are three types of objects in an annotation task:
|
||||
|
||||
the coders (variables "c" and "C")
|
||||
the items to be annotated (variables "i" and "I")
|
||||
the potential categories to be assigned (variables "k" and "K")
|
||||
|
||||
Additionally, it is often the case that we don't want to treat two different
|
||||
labels as complete disagreement, and so the AnnotationTask constructor can also
|
||||
take a distance metric as a final argument. Distance metrics are simply
|
||||
functions that take two arguments, and return a value between 0.0 and 1.0
|
||||
indicating the distance between them. If not supplied, the default is binary
|
||||
comparison between the arguments.
|
||||
|
||||
The simplest way to initialize an AnnotationTask is with a list of triples,
|
||||
each containing a coder's assignment for one object in the task:
|
||||
|
||||
task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...])
|
||||
|
||||
Note that the data list needs to contain the same number of triples for each
|
||||
individual coder, containing category values for the same set of items.
|
||||
|
||||
Alpha (Krippendorff 1980)
|
||||
Kappa (Cohen 1960)
|
||||
S (Bennett, Albert and Goldstein 1954)
|
||||
Pi (Scott 1955)
|
||||
|
||||
|
||||
TODO: Describe handling of multiple coders and missing data
|
||||
|
||||
Expected results from the Artstein and Poesio survey paper:
|
||||
|
||||
>>> from nltk.metrics.agreement import AnnotationTask
|
||||
>>> import os.path
|
||||
>>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
|
||||
>>> t.avg_Ao()
|
||||
0.88
|
||||
>>> round(t.pi(), 5)
|
||||
0.79953
|
||||
>>> round(t.S(), 2)
|
||||
0.82
|
||||
|
||||
This would have returned a wrong value (0.0) in @785fb79 as coders are in
|
||||
the wrong order. Subsequently, all values for pi(), S(), and kappa() would
|
||||
have been wrong as they are computed with avg_Ao().
|
||||
>>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')])
|
||||
>>> t2.avg_Ao()
|
||||
1.0
|
||||
|
||||
The following, of course, also works.
|
||||
>>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')])
|
||||
>>> t3.avg_Ao()
|
||||
1.0
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
|
||||
from nltk.internals import deprecated
|
||||
from nltk.metrics.distance import binary_distance
|
||||
from nltk.probability import ConditionalFreqDist, FreqDist
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnnotationTask:
    """Represents an annotation task, i.e. people assign labels to items.

    Notation tries to match notation in Artstein and Poesio (2007).

    In general, coders and items can be represented as any hashable object.
    Integers, for example, are fine, though strings are more readable.
    Labels must support the distance functions applied to them, so e.g.
    a string-edit-distance makes no sense if your labels are integers,
    whereas interval distance needs numeric values.  A notable case of this
    is the MASI metric, which requires Python sets.

    Attributes set by the constructor and ``load_array``:

    - ``distance``: callable taking two labels and returning their distance
    - ``C``: set of all coders seen
    - ``I``: set of all items seen
    - ``K``: set of all labels seen
    - ``data``: list of ``{"coder", "labels", "item"}`` dicts, one per triple
    """

    def __init__(self, data=None, distance=binary_distance):
        """Initialize an annotation task.

        The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
        each representing a coder's labeling of an item:
        ``(coder,item,label)``

        The distance argument is a function taking two arguments (labels) and producing a numerical distance.
        The distance from a label to itself should be zero:
        ``distance(l,l) = 0``
        """
        self.distance = distance
        self.I = set()
        self.K = set()
        self.C = set()
        self.data = []
        if data is not None:
            self.load_array(data)

    def __str__(self):
        """Render one ``coder<TAB>item<TAB>labels`` line per record, CRLF-joined.

        Underscores in the item's string form are turned into tabs.  A label
        that is a collection is rendered as its comma-joined elements; a plain
        string label is emitted as-is (joining it would split it into
        characters), and any other non-iterable label falls back to ``str()``.
        """
        rows = []
        for record in self.data:
            labels = record["labels"]
            if isinstance(labels, str):
                label_text = labels
            else:
                try:
                    label_text = ",".join(str(label) for label in labels)
                except TypeError:
                    # Not iterable (e.g. an integer label).
                    label_text = str(labels)
            # Items may be any hashable object, so stringify before replacing.
            item_text = str(record["item"]).replace("_", "\t")
            rows.append(f"{record['coder']}\t{item_text}\t{label_text}")
        return "\r\n".join(rows)

    def load_array(self, array):
        """Load a sequence of annotation results, appending to any data already loaded.

        The argument is a sequence of 3-tuples, each representing a coder's labeling of an item:
            (coder,item,label)
        """
        for coder, item, labels in array:
            self.C.add(coder)
            self.K.add(labels)
            self.I.add(item)
            self.data.append({"coder": coder, "labels": labels, "item": item})

    def agr(self, cA, cB, i, data=None):
        """Agreement between two coders on a given item.

        ``data`` is an optional iterable of records to search; when it is
        falsy (``None`` or empty) ``self.data`` is searched instead.  Returns
        ``1.0 - distance(label_A, label_B)``.  ``next`` raises
        ``StopIteration`` if either coder has no record for item ``i``.
        """
        data = data or self.data
        # cfedermann: we don't know what combination of coder/item will come
        # first in x; to avoid StopIteration problems due to assuming an order
        # cA,cB, we allow either for k1 and then look up the missing as k2.
        # NOTE: ``data`` may be a one-shot iterator (``Ao`` passes a groupby
        # group), in which case the second search continues after ``k1``.
        k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i)
        if k1["coder"] == cA:
            k2 = next(x for x in data if x["coder"] == cB and x["item"] == i)
        else:
            k2 = next(x for x in data if x["coder"] == cA and x["item"] == i)

        ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
        log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
        log.debug(
            'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
        )
        return ret

    def Nk(self, k):
        """Number of records labelled ``k``, as a float."""
        return float(sum(1 for x in self.data if x["labels"] == k))

    def Nik(self, i, k):
        """Number of records for item ``i`` labelled ``k``, as a float."""
        return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))

    def Nck(self, c, k):
        """Number of records by coder ``c`` labelled ``k``, as a float."""
        return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))

    @deprecated("Use Nk, Nik or Nck instead")
    def N(self, k=None, i=None, c=None):
        """Implements the "n-notation" used in Artstein and Poesio (2007)

        Dispatches to ``Nk``, ``Nik`` or ``Nck`` depending on which of
        ``i``/``c`` accompanies ``k``; any other combination (including a
        missing ``k``) raises ``ValueError``.
        """
        if k is not None and i is None and c is None:
            ret = self.Nk(k)
        elif k is not None and i is not None and c is None:
            ret = self.Nik(i, k)
        elif k is not None and c is not None and i is None:
            ret = self.Nck(c, k)
        else:
            raise ValueError(
                f"You must pass either i or c, not both! (k={k!r},i={i!r},c={c!r})"
            )
        log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
        return ret

    def _grouped_data(self, field, data=None):
        """Group records by ``field``; returns an ``itertools.groupby`` iterator.

        Records are sorted by the same key first, as ``groupby`` requires.
        A falsy ``data`` falls back to ``self.data``.
        """
        data = data or self.data
        return groupby(sorted(data, key=itemgetter(field)), itemgetter(field))

    def Ao(self, cA, cB):
        """Observed agreement between two coders on all items."""
        data = self._grouped_data(
            "item", (x for x in self.data if x["coder"] in (cA, cB))
        )
        ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
            self.I
        )
        log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
        return ret

    def _pairwise_average(self, function):
        """
        Calculates the average of function results for each coder pair

        Each unordered pair of distinct coders is visited once.  With fewer
        than two coders there are no pairs and the final division raises
        ``ZeroDivisionError``.
        """
        total = 0
        n = 0
        s = self.C.copy()
        for cA in self.C:
            # Removing cA first means every pair is only counted once.
            s.remove(cA)
            for cB in s:
                total += function(cA, cB)
                n += 1
        ret = total / n
        return ret

    def avg_Ao(self):
        """Average observed agreement across all coders and items."""
        ret = self._pairwise_average(self.Ao)
        log.debug("Average observed agreement: %f", ret)
        return ret

    def Do_Kw_pairwise(self, cA, cB, max_distance=1.0):
        """The observed disagreement for the weighted kappa coefficient."""
        total = 0.0
        data = (x for x in self.data if x["coder"] in (cA, cB))
        for i, itemdata in self._grouped_data("item", data):
            # we should have two items; distance doesn't care which comes first
            total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])

        ret = total / (len(self.I) * max_distance)
        log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
        return ret

    def Do_Kw(self, max_distance=1.0):
        """Averaged over all labelers"""
        ret = self._pairwise_average(
            lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)
        )
        log.debug("Observed disagreement: %f", ret)
        return ret

    # Agreement Coefficients
    def S(self):
        """Bennett, Albert and Goldstein 1954

        Expected agreement is uniform over the observed labels: ``1 / len(K)``.
        """
        Ae = 1.0 / len(self.K)
        ret = (self.avg_Ao() - Ae) / (1.0 - Ae)
        return ret

    def pi(self):
        """Scott 1955; here, multi-pi.
        Equivalent to K from Siegel and Castellan (1988).

        """
        total = 0.0
        label_freqs = FreqDist(x["labels"] for x in self.data)
        for f in label_freqs.values():
            total += f**2
        Ae = total / ((len(self.I) * len(self.C)) ** 2)
        return (self.avg_Ao() - Ae) / (1 - Ae)

    def Ae_kappa(self, cA, cB):
        """Expected agreement between two coders for kappa.

        Sums, over all labels, the product of the two coders' per-item
        frequencies of that label.
        """
        Ae = 0.0
        nitems = float(len(self.I))
        label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
        for k in label_freqs.conditions():
            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
        return Ae

    def kappa_pairwise(self, cA, cB):
        """Kappa for a single pair of coders: ``(Ao - Ae) / (1 - Ae)``."""
        Ae = self.Ae_kappa(cA, cB)
        ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae)
        log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae)
        return ret

    def kappa(self):
        """Cohen 1960
        Averages naively over kappas for each coder pair.

        """
        return self._pairwise_average(self.kappa_pairwise)

    def multi_kappa(self):
        """Davies and Fleiss 1982
        Averages over observed and expected agreements for each coder pair.

        """
        Ae = self._pairwise_average(self.Ae_kappa)
        return (self.avg_Ao() - Ae) / (1.0 - Ae)

    def Disagreement(self, label_freqs):
        """Disagreement within a label frequency table.

        Sums ``n_j * n_l * distance(l, j)`` over every ordered pair of labels
        and divides by ``total * (total - 1)``, where ``total`` is the sum of
        all counts.
        """
        total_labels = sum(label_freqs.values())
        pairs = 0.0
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
                pairs += float(nj * nl) * self.distance(l, j)
        return 1.0 * pairs / (total_labels * (total_labels - 1))

    def alpha(self):
        """Krippendorff 1980

        Items with fewer than two labels are ignored.  Raises ``ValueError``
        when no data is present or when there is only one coder and one item.
        """
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, alpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError("Cannot calculate alpha, only one coder and item present!")

        all_valid_labels_freq = FreqDist([])
        total_do = 0.0  # Total observed disagreement for all items.
        for i, itemdata in self._grouped_data("item"):
            label_freqs = FreqDist(x["labels"] for x in itemdata)
            labels_count = sum(label_freqs.values())
            if labels_count < 2:
                # Ignore the item.
                continue
            all_valid_labels_freq += label_freqs
            total_do += self.Disagreement(label_freqs) * labels_count

        if len(all_valid_labels_freq.keys()) == 1:
            log.debug("Only one valid annotation value, alpha returning 1.")
            return 1

        do = total_do / sum(all_valid_labels_freq.values())

        de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
        k_alpha = 1.0 - do / de

        return k_alpha

    def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
        """Cohen 1968

        ``max_distance`` normalises both the expected and the observed
        disagreement, so it cancels out of the returned ratio.
        """
        total = 0.0
        label_freqs = ConditionalFreqDist(
            (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
        )
        for j in self.K:
            for l in self.K:
                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
        De = total / (max_distance * pow(len(self.I), 2))
        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
        # Use the same normalisation as De; otherwise Do/De is off by a
        # factor of max_distance.
        Do = self.Do_Kw_pairwise(cA, cB, max_distance)
        ret = 1.0 - (Do / De)
        return ret

    def weighted_kappa(self, max_distance=1.0):
        """Cohen 1968"""
        return self._pairwise_average(
            lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line driver: read "coder item labels" lines from a file and
    # print one agreement coefficient for them.
    import optparse

    from nltk.metrics import distance

    # process command-line arguments
    parser = optparse.OptionParser()
    parser.add_option(
        "-d",
        "--distance",
        dest="distance",
        default="binary_distance",
        help="distance metric to use",
    )
    parser.add_option(
        "-a",
        "--agreement",
        dest="agreement",
        default="kappa",
        help="agreement coefficient to calculate",
    )
    parser.add_option(
        "-e",
        "--exclude",
        dest="exclude",
        action="append",
        default=[],
        help="coder names to exclude (may be specified multiple times)",
    )
    parser.add_option(
        "-i",
        "--include",
        dest="include",
        action="append",
        default=[],
        help="coder names to include, same format as exclude",
    )
    parser.add_option(
        "-f",
        "--file",
        dest="file",
        help="file to read labelings from, each line with three columns: 'labeler item labels'",
    )
    parser.add_option(
        "-v",
        "--verbose",
        dest="verbose",
        default="0",
        help="how much debugging to print on stderr (0-4)",
    )
    parser.add_option(
        "-c",
        "--columnsep",
        dest="columnsep",
        default="\t",
        help="char/string that separates the three columns in the file, defaults to tab",
    )
    parser.add_option(
        "-l",
        "--labelsep",
        dest="labelsep",
        default=",",
        help="char/string that separates labels (if labelers can assign more than one), defaults to comma",
    )
    parser.add_option(
        "-p",
        "--presence",
        dest="presence",
        default=None,
        help="convert each labeling into 1 or 0, based on presence of LABEL",
    )
    parser.add_option(
        "-T",
        "--thorough",
        dest="thorough",
        default=False,
        action="store_true",
        help="calculate agreement for every subset of the annotators",
    )
    (options, remainder) = parser.parse_args()

    if not options.file:
        parser.print_help()
        # Equivalent to exit(), but does not depend on the site module.
        raise SystemExit

    # verbose 0 -> CRITICAL (50) ... verbose 4 -> DEBUG (10)
    logging.basicConfig(level=50 - 10 * int(options.verbose))

    # read in data from the specified file
    data = []
    with open(options.file) as infile:
        for line in infile:
            if not line.strip():
                # A blank line would otherwise become a bogus record whose
                # coder is "\n" and whose label set is {""}.
                continue
            toks = line.split(options.columnsep)
            coder, object_, labels = (
                toks[0],
                # every column between the first and last forms the item key
                str(toks[1:-1]),
                frozenset(toks[-1].strip().split(options.labelsep)),
            )
            # Keep the record when no filter is active (both lists equal),
            # when the coder is explicitly included, or when an exclude list
            # is given and the coder is not on it.
            if (
                (options.include == options.exclude)
                or (len(options.include) > 0 and coder in options.include)
                or (len(options.exclude) > 0 and coder not in options.exclude)
            ):
                data.append((coder, object_, labels))

    if options.presence:
        # The named distance is called with the label to build the metric.
        task = AnnotationTask(
            data, getattr(distance, options.distance)(options.presence)
        )
    else:
        task = AnnotationTask(data, getattr(distance, options.distance))

    if options.thorough:
        # NOTE(review): --thorough is accepted but nothing is computed or
        # printed for it.
        pass
    else:
        print(getattr(task, options.agreement)())

    logging.shutdown()
|
||||
1597
backend/venv/Lib/site-packages/nltk/metrics/aline.py
Normal file
1597
backend/venv/Lib/site-packages/nltk/metrics/aline.py
Normal file
File diff suppressed because it is too large
Load Diff
476
backend/venv/Lib/site-packages/nltk/metrics/association.py
Normal file
476
backend/venv/Lib/site-packages/nltk/metrics/association.py
Normal file
@@ -0,0 +1,476 @@
|
||||
# Natural Language Toolkit: Ngram Association Measures
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Provides scoring functions for a number of association measures through a
|
||||
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
|
||||
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
|
||||
"""
|
||||
|
||||
import math as _math
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from functools import reduce
|
||||
|
||||
# Logarithm helpers used by the association measures below.
_log2 = _math.log2
_ln = _math.log


def _product(s):
    """Return the product of all elements of the non-empty iterable ``s``."""
    return reduce(lambda acc, factor: acc * factor, s)


# Tiny constant added to denominators and log arguments to avoid dividing by
# (or taking the log of) zero.
_SMALL = 1e-20

try:
    from scipy.stats import fisher_exact
except ImportError:

    def fisher_exact(*_args, **_kwargs):
        # Stand-in used when scipy is not installed.
        raise NotImplementedError


### Indices to marginals arguments:

NGRAM = 0
"""Marginals index for the ngram count"""

UNIGRAMS = -2
"""Marginals index for a tuple of each unigram count"""

TOTAL = -1
"""Marginals index for the number of words in the data"""
|
||||
|
||||
|
||||
class NgramAssocMeasures(metaclass=ABCMeta):
    """
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    """

    # Order of the n-gram; overridden by concrete subclasses.
    _n = 0

    @staticmethod
    @abstractmethod
    def _contingency(*marginals):
        """Calculates values of a contingency table from marginal values."""
        raise NotImplementedError(
            "The contingency table is not available in the general ngram case"
        )

    @staticmethod
    @abstractmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values."""
        raise NotImplementedError(
            "The contingency table is not available in the general ngram case"
        )

    @classmethod
    def _expected_values(cls, cont):
        """Calculates expected values for a contingency table.

        Generator yielding one expected value per cell of ``cont``.
        """
        n_all = sum(cont)
        # One bit mask per word position of the n-gram.
        bits = [1 << i for i in range(cls._n)]

        # For each contingency table cell
        for i in range(len(cont)):
            # Yield the expected value
            yield (
                _product(
                    # Sum of all cells that agree with cell i on bit j.
                    sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
                    for j in bits
                )
                / (n_all ** (cls._n - 1))
            )

    @staticmethod
    def raw_freq(*marginals):
        """Scores ngrams by their frequency"""
        return marginals[NGRAM] / marginals[TOTAL]

    @classmethod
    def student_t(cls, *marginals):
        """Scores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        """
        return (
            marginals[NGRAM]
            - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
        ) / (marginals[NGRAM] + _SMALL) ** 0.5

    @classmethod
    def chi_sq(cls, *marginals):
        """Scores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        """
        cont = cls._contingency(*marginals)
        exps = cls._expected_values(cont)
        return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))

    @staticmethod
    def mi_like(*marginals, **kwargs):
        """Scores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        """
        return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
            marginals[UNIGRAMS]
        )

    @classmethod
    def pmi(cls, *marginals):
        """Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        """
        return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
            _product(marginals[UNIGRAMS])
        )

    @classmethod
    def likelihood_ratio(cls, *marginals):
        """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
        cont = cls._contingency(*marginals)
        return 2 * sum(
            obs * _ln(obs / (exp + _SMALL) + _SMALL)
            for obs, exp in zip(cont, cls._expected_values(cont))
        )

    @classmethod
    def poisson_stirling(cls, *marginals):
        """Scores ngrams using the Poisson-Stirling measure."""
        exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
        return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)

    @classmethod
    def jaccard(cls, *marginals):
        """Scores ngrams using the Jaccard index."""
        cont = cls._contingency(*marginals)
        # The last cell (no word of the n-gram matches) is left out of the
        # denominator.
        return cont[0] / sum(cont[:-1])
|
||||
|
||||
|
||||
class BigramAssocMeasures(NgramAssocMeasures):
    """
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
    - n_ix counts ``(w1, *)``
    - n_xi counts ``(*, w2)``
    - n_xx counts ``(*, *)``, i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    """

    _n = 2

    @staticmethod
    def _contingency(n_ii, n_ix_xi_tuple, n_xx):
        """Calculates values of a bigram contingency table from marginal values.

        Returns the cells in the order ``(n_ii, n_oi, n_io, n_oo)``.
        """
        (n_ix, n_xi) = n_ix_xi_tuple
        n_oi = n_xi - n_ii
        n_io = n_ix - n_ii
        return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)

    @staticmethod
    def _marginals(n_ii, n_oi, n_io, n_oo):
        """Calculates values of contingency table marginals from its values.

        Inverse of ``_contingency``: returns ``(n_ii, (n_ix, n_xi), n_xx)``.
        """
        # n_ix = n_ii + n_io and n_xi = n_ii + n_oi, so that the pair comes
        # back in the (n_ix, n_xi) order that the score functions expect.
        return (n_ii, (n_io + n_ii, n_oi + n_ii), n_oo + n_oi + n_io + n_ii)

    @staticmethod
    def _expected_values(cont):
        """Calculates expected values for a contingency table."""
        n_xx = sum(cont)
        # For each contingency table cell
        for i in range(4):
            # i ^ 1 and i ^ 2 select the neighbouring cell along each axis.
            yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx

    @classmethod
    def phi_sq(cls, *marginals):
        """Scores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        """
        # Names follow the order in which _contingency returns the cells.
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)

        return (n_ii * n_oo - n_oi * n_io) ** 2 / (
            (n_ii + n_oi) * (n_ii + n_io) * (n_oi + n_oo) * (n_io + n_oo)
        )

    @classmethod
    def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
        """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        """
        (n_ix, n_xi) = n_ix_xi_tuple
        return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)

    @classmethod
    def fisher(cls, *marginals):
        """Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        """

        # Names follow the order in which _contingency returns the cells; the
        # table handed to fisher_exact is laid out exactly as before.
        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)

        (odds, pvalue) = fisher_exact([[n_ii, n_oi], [n_io, n_oo]], alternative="less")
        return pvalue

    @staticmethod
    def dice(n_ii, n_ix_xi_tuple, n_xx):
        """Scores bigrams using Dice's coefficient."""
        (n_ix, n_xi) = n_ix_xi_tuple
        return 2 * n_ii / (n_ix + n_xi)
|
||||
|
||||
|
||||
class TrigramAssocMeasures(NgramAssocMeasures):
    """
    Association measures for trigrams.  Every measure is called with four
    arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    These are the marginals of a contingency table counting events in a
    corpus.  In each suffix, ``i`` stands for the word in question appearing
    at that position and ``x`` for any word appearing there.  For example:

    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
    - n_ixx counts ``(w1, *, *)``
    - n_xxx counts ``(*, *, *)``, i.e. any trigram
    """

    _n = 3

    @staticmethod
    def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
        """Derive the eight cells of the trigram contingency cube from the
        marginal counts.

        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        """
        n_iix, n_ixi, n_xii = n_iix_tuple
        n_ixx, n_xix, n_xxi = n_ixx_tuple

        # Cells where exactly one position differs from the trigram.
        n_oii = n_xii - n_iii
        n_ioi = n_ixi - n_iii
        n_iio = n_iix - n_iii

        # Cells where exactly one position matches.
        n_ooi = n_xxi - n_iii - n_oii - n_ioi
        n_oio = n_xix - n_iii - n_oii - n_iio
        n_ioo = n_ixx - n_iii - n_ioi - n_iio

        # Whatever is left of the total matches at no position.
        n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo

        cells = (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
        return cells

    @staticmethod
    def _marginals(*contingency):
        """Rebuild the marginal counts from the eight contingency cells.

        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency

        # Bigram-level marginals: (n_iix, n_ixi, n_xii)
        two_word = (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii)

        # Unigram-level marginals: (n_ixx, n_xix, n_xxi)
        one_word = (
            n_iii + n_ioi + n_iio + n_ioo,
            n_iii + n_oii + n_iio + n_oio,
            n_iii + n_oii + n_ioi + n_ooi,
        )

        return (n_iii, two_word, one_word, sum(contingency))
|
||||
|
||||
|
||||
class QuadgramAssocMeasures(NgramAssocMeasures):
    """
    Association measures for quadgrams.  Every measure is called with five
    arguments::

        quadgram_score_fn(n_iiii,
                          (n_iiix, n_iixi, n_ixii, n_xiii),
                          (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                          (n_ixxx, n_xixx, n_xxix, n_xxxi),
                          n_xxxx)

    These are the marginals of a contingency table counting events in a
    corpus.  In each suffix, ``i`` stands for the word in question appearing
    at that position and ``x`` for any word appearing there.  For example:

    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
    - n_ixxi counts ``(w1, *, *, w4)``
    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
    """

    _n = 4

    @staticmethod
    def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
        """Derive the sixteen cells of the quadgram contingency table from the
        marginal counts.
        """
        n_iiix, n_iixi, n_ixii, n_xiii = n_iiix_tuple
        n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix = n_iixx_tuple
        n_ixxx, n_xixx, n_xxix, n_xxxi = n_ixxx_tuple

        # Cells whose fourth position matches (suffix ...i).
        n_oiii = n_xiii - n_iiii
        n_ioii = n_ixii - n_iiii
        n_iioi = n_iixi - n_iiii
        n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
        n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
        n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
        n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi

        # Cells whose fourth position does not match (suffix ...o).
        n_iiio = n_iiix - n_iiii
        n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
        n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
        n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
        n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
        n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
        n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio

        # Whatever is left of the total matches at no position.
        n_oooo = (
            n_xxxx
            - n_iiii
            - n_oiii
            - n_ioii
            - n_iioi
            - n_ooii
            - n_oioi
            - n_iooi
            - n_oooi
            - n_iiio
            - n_oiio
            - n_ioio
            - n_ooio
            - n_iioo
            - n_oioo
            - n_iooo
        )

        cells = (
            n_iiii,
            n_oiii,
            n_ioii,
            n_ooii,
            n_iioi,
            n_oioi,
            n_iooi,
            n_oooi,
            n_iiio,
            n_oiio,
            n_ioio,
            n_ooio,
            n_iioo,
            n_oioo,
            n_iooo,
            n_oooo,
        )
        return cells

    @staticmethod
    def _marginals(*contingency):
        """Rebuild the marginal counts from the sixteen contingency cells.

        >>> QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        """
        (
            n_iiii,
            n_oiii,
            n_ioii,
            n_ooii,
            n_iioi,
            n_oioi,
            n_iooi,
            n_oooi,
            n_iiio,
            n_oiio,
            n_ioio,
            n_ooio,
            n_iioo,
            n_oioo,
            n_iooo,
            n_oooo,
        ) = contingency

        # Trigram-level marginals: (n_iiix, n_iixi, n_ixii, n_xiii)
        three_word = (
            n_iiii + n_iiio,
            n_iiii + n_iioi,
            n_iiii + n_ioii,
            n_iiii + n_oiii,
        )

        # Bigram-level marginals:
        # (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix)
        two_word = (
            n_iiii + n_iioi + n_iiio + n_iioo,
            n_iiii + n_ioii + n_iiio + n_ioio,
            n_iiii + n_ioii + n_iioi + n_iooi,
            n_iiii + n_oiii + n_iioi + n_oioi,
            n_iiii + n_oiii + n_ioii + n_ooii,
            n_iiii + n_oiii + n_iiio + n_oiio,
        )

        # Unigram-level marginals: (n_ixxx, n_xixx, n_xxix, n_xxxi)
        one_word = (
            n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo,
            n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo,
            n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio,
            n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi,
        )

        n_all = sum(contingency)

        return (n_iiii, three_word, two_word, one_word, n_all)
|
||||
|
||||
|
||||
class ContingencyMeasures:
    """Wraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    """

    def __init__(self, measures):
        """Constructs a ContingencyMeasures given a NgramAssocMeasures class.

        Every public attribute of ``measures`` is wrapped so that it accepts
        contingency table values; single-underscore attributes are copied
        unchanged and double-underscore attributes are skipped.

        :param measures: an association-measures class (or instance) that
            provides ``_marginals``.
        """
        # Use the wrapped class's own name.  ``measures.__class__.__name__``
        # yields the *metaclass* name when ``measures`` is a class, which is
        # the documented way to call this constructor.
        measures_name = getattr(measures, "__name__", type(measures).__name__)
        # NOTE: this renames the class object itself, so the name is shared
        # by every ContingencyMeasures instance (pre-existing behaviour).
        self.__class__.__name__ = "Contingency" + measures_name

        for attr_name in dir(measures):
            if attr_name.startswith("__"):
                continue
            attr = getattr(measures, attr_name)
            if not attr_name.startswith("_"):
                attr = self._make_contingency_fn(measures, attr)
            setattr(self, attr_name, attr)

    @staticmethod
    def _make_contingency_fn(measures, old_fn):
        """From an association measure function, produces a new function which
        accepts contingency table values as its arguments.

        :param measures: object whose ``_marginals`` converts contingency
            values into the marginals expected by ``old_fn``.
        :param old_fn: the marginal-based association measure to wrap.
        :return: a function taking ``*contingency`` with the name and
            docstring of ``old_fn``.
        """

        def res(*contingency):
            return old_fn(*measures._marginals(*contingency))

        res.__doc__ = old_fn.__doc__
        res.__name__ = old_fn.__name__
        return res
|
||||
351
backend/venv/Lib/site-packages/nltk/metrics/confusionmatrix.py
Normal file
351
backend/venv/Lib/site-packages/nltk/metrics/confusionmatrix.py
Normal file
@@ -0,0 +1,351 @@
|
||||
# Natural Language Toolkit: Confusion Matrices
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Tom Aarsen <>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.probability import FreqDist
|
||||
|
||||
|
||||
class ConfusionMatrix:
|
||||
"""
|
||||
The confusion matrix between a list of reference values and a
|
||||
corresponding list of test values. Entry *[r,t]* of this
|
||||
matrix is a count of the number of times that the reference value
|
||||
*r* corresponds to the test value *t*. E.g.:
|
||||
|
||||
>>> from nltk.metrics import ConfusionMatrix
|
||||
>>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
|
||||
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
|
||||
>>> cm = ConfusionMatrix(ref, test)
|
||||
>>> print(cm['NN', 'NN'])
|
||||
3
|
||||
|
||||
Note that the diagonal entries *Ri=Tj* of this matrix
|
||||
corresponds to correct values; and the off-diagonal entries
|
||||
correspond to incorrect values.
|
||||
"""
|
||||
|
||||
def __init__(self, reference, test, sort_by_count=False):
|
||||
"""
|
||||
Construct a new confusion matrix from a list of reference
|
||||
values and a corresponding list of test values.
|
||||
|
||||
:type reference: list
|
||||
:param reference: An ordered list of reference values.
|
||||
:type test: list
|
||||
:param test: A list of values to compare against the
|
||||
corresponding reference values.
|
||||
:raise ValueError: If ``reference`` and ``length`` do not have
|
||||
the same length.
|
||||
"""
|
||||
if len(reference) != len(test):
|
||||
raise ValueError("Lists must have the same length.")
|
||||
|
||||
# Get a list of all values.
|
||||
if sort_by_count:
|
||||
ref_fdist = FreqDist(reference)
|
||||
test_fdist = FreqDist(test)
|
||||
|
||||
def key(v):
|
||||
return -(ref_fdist[v] + test_fdist[v])
|
||||
|
||||
values = sorted(set(reference + test), key=key)
|
||||
else:
|
||||
values = sorted(set(reference + test))
|
||||
|
||||
# Construct a value->index dictionary
|
||||
indices = {val: i for (i, val) in enumerate(values)}
|
||||
|
||||
# Make a confusion matrix table.
|
||||
confusion = [[0 for _ in values] for _ in values]
|
||||
max_conf = 0 # Maximum confusion
|
||||
for w, g in zip(reference, test):
|
||||
confusion[indices[w]][indices[g]] += 1
|
||||
max_conf = max(max_conf, confusion[indices[w]][indices[g]])
|
||||
|
||||
#: A list of all values in ``reference`` or ``test``.
|
||||
self._values = values
|
||||
#: A dictionary mapping values in ``self._values`` to their indices.
|
||||
self._indices = indices
|
||||
#: The confusion matrix itself (as a list of lists of counts).
|
||||
self._confusion = confusion
|
||||
#: The greatest count in ``self._confusion`` (used for printing).
|
||||
self._max_conf = max_conf
|
||||
#: The total number of values in the confusion matrix.
|
||||
self._total = len(reference)
|
||||
#: The number of correct (on-diagonal) values in the matrix.
|
||||
self._correct = sum(confusion[i][i] for i in range(len(values)))
|
||||
|
||||
def __getitem__(self, li_lj_tuple):
|
||||
"""
|
||||
:return: The number of times that value ``li`` was expected and
|
||||
value ``lj`` was given.
|
||||
:rtype: int
|
||||
"""
|
||||
(li, lj) = li_lj_tuple
|
||||
i = self._indices[li]
|
||||
j = self._indices[lj]
|
||||
return self._confusion[i][j]
|
||||
|
||||
def __repr__(self):
|
||||
return f"<ConfusionMatrix: {self._correct}/{self._total} correct>"
|
||||
|
||||
def __str__(self):
|
||||
return self.pretty_format()
|
||||
|
||||
def pretty_format(
|
||||
self,
|
||||
show_percents=False,
|
||||
values_in_chart=True,
|
||||
truncate=None,
|
||||
sort_by_count=False,
|
||||
):
|
||||
"""
|
||||
:return: A multi-line string representation of this confusion matrix.
|
||||
:type truncate: int
|
||||
:param truncate: If specified, then only show the specified
|
||||
number of values. Any sorting (e.g., sort_by_count)
|
||||
will be performed before truncation.
|
||||
:param sort_by_count: If true, then sort by the count of each
|
||||
label in the reference data. I.e., labels that occur more
|
||||
frequently in the reference label will be towards the left
|
||||
edge of the matrix, and labels that occur less frequently
|
||||
will be towards the right edge.
|
||||
|
||||
@todo: add marginals?
|
||||
"""
|
||||
confusion = self._confusion
|
||||
|
||||
values = self._values
|
||||
if sort_by_count:
|
||||
values = sorted(
|
||||
values, key=lambda v: -sum(self._confusion[self._indices[v]])
|
||||
)
|
||||
|
||||
if truncate:
|
||||
values = values[:truncate]
|
||||
|
||||
if values_in_chart:
|
||||
value_strings = ["%s" % val for val in values]
|
||||
else:
|
||||
value_strings = [str(n + 1) for n in range(len(values))]
|
||||
|
||||
# Construct a format string for row values
|
||||
valuelen = max(len(val) for val in value_strings)
|
||||
value_format = "%" + repr(valuelen) + "s | "
|
||||
# Construct a format string for matrix entries
|
||||
if show_percents:
|
||||
entrylen = 6
|
||||
entry_format = "%5.1f%%"
|
||||
zerostr = " ."
|
||||
else:
|
||||
entrylen = len(repr(self._max_conf))
|
||||
entry_format = "%" + repr(entrylen) + "d"
|
||||
zerostr = " " * (entrylen - 1) + "."
|
||||
|
||||
# Write the column values.
|
||||
s = ""
|
||||
for i in range(valuelen):
|
||||
s += (" " * valuelen) + " |"
|
||||
for val in value_strings:
|
||||
if i >= valuelen - len(val):
|
||||
s += val[i - valuelen + len(val)].rjust(entrylen + 1)
|
||||
else:
|
||||
s += " " * (entrylen + 1)
|
||||
s += " |\n"
|
||||
|
||||
# Write a dividing line
|
||||
s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
|
||||
|
||||
# Write the entries.
|
||||
for val, li in zip(value_strings, values):
|
||||
i = self._indices[li]
|
||||
s += value_format % val
|
||||
for lj in values:
|
||||
j = self._indices[lj]
|
||||
if confusion[i][j] == 0:
|
||||
s += zerostr
|
||||
elif show_percents:
|
||||
s += entry_format % (100.0 * confusion[i][j] / self._total)
|
||||
else:
|
||||
s += entry_format % confusion[i][j]
|
||||
if i == j:
|
||||
prevspace = s.rfind(" ")
|
||||
s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
|
||||
else:
|
||||
s += " "
|
||||
s += "|\n"
|
||||
|
||||
# Write a dividing line
|
||||
s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
|
||||
|
||||
# Write a key
|
||||
s += "(row = reference; col = test)\n"
|
||||
if not values_in_chart:
|
||||
s += "Value key:\n"
|
||||
for i, value in enumerate(values):
|
||||
s += "%6d: %s\n" % (i + 1, value)
|
||||
|
||||
return s
|
||||
|
||||
def key(self):
|
||||
values = self._values
|
||||
str = "Value key:\n"
|
||||
indexlen = len(repr(len(values) - 1))
|
||||
key_format = " %" + repr(indexlen) + "d: %s\n"
|
||||
str += "".join([key_format % (i, values[i]) for i in range(len(values))])
|
||||
return str
|
||||
|
||||
def recall(self, value):
|
||||
"""Given a value in the confusion matrix, return the recall
|
||||
that corresponds to this value. The recall is defined as:
|
||||
|
||||
- *r* = true positive / (true positive + false positive)
|
||||
|
||||
and can loosely be considered the ratio of how often ``value``
|
||||
was predicted correctly relative to how often ``value`` was
|
||||
the true result.
|
||||
|
||||
:param value: value used in the ConfusionMatrix
|
||||
:return: the recall corresponding to ``value``.
|
||||
:rtype: float
|
||||
"""
|
||||
# Number of times `value` was correct, and also predicted
|
||||
TP = self[value, value]
|
||||
# Number of times `value` was correct
|
||||
TP_FN = sum(self[value, pred_value] for pred_value in self._values)
|
||||
if TP_FN == 0:
|
||||
return 0.0
|
||||
return TP / TP_FN
|
||||
|
||||
def precision(self, value):
|
||||
"""Given a value in the confusion matrix, return the precision
|
||||
that corresponds to this value. The precision is defined as:
|
||||
|
||||
- *p* = true positive / (true positive + false negative)
|
||||
|
||||
and can loosely be considered the ratio of how often ``value``
|
||||
was predicted correctly relative to the number of predictions
|
||||
for ``value``.
|
||||
|
||||
:param value: value used in the ConfusionMatrix
|
||||
:return: the precision corresponding to ``value``.
|
||||
:rtype: float
|
||||
"""
|
||||
# Number of times `value` was correct, and also predicted
|
||||
TP = self[value, value]
|
||||
# Number of times `value` was predicted
|
||||
TP_FP = sum(self[real_value, value] for real_value in self._values)
|
||||
if TP_FP == 0:
|
||||
return 0.0
|
||||
return TP / TP_FP
|
||||
|
||||
def f_measure(self, value, alpha=0.5):
|
||||
"""
|
||||
Given a value used in the confusion matrix, return the f-measure
|
||||
that corresponds to this value. The f-measure is the harmonic mean
|
||||
of the ``precision`` and ``recall``, weighted by ``alpha``.
|
||||
In particular, given the precision *p* and recall *r* defined by:
|
||||
|
||||
- *p* = true positive / (true positive + false negative)
|
||||
- *r* = true positive / (true positive + false positive)
|
||||
|
||||
The f-measure is:
|
||||
|
||||
- *1/(alpha/p + (1-alpha)/r)*
|
||||
|
||||
With ``alpha = 0.5``, this reduces to:
|
||||
|
||||
- *2pr / (p + r)*
|
||||
|
||||
:param value: value used in the ConfusionMatrix
|
||||
:param alpha: Ratio of the cost of false negative compared to false
|
||||
positives. Defaults to 0.5, where the costs are equal.
|
||||
:type alpha: float
|
||||
:return: the F-measure corresponding to ``value``.
|
||||
:rtype: float
|
||||
"""
|
||||
p = self.precision(value)
|
||||
r = self.recall(value)
|
||||
if p == 0.0 or r == 0.0:
|
||||
return 0.0
|
||||
return 1.0 / (alpha / p + (1 - alpha) / r)
|
||||
|
||||
def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False):
|
||||
"""
|
||||
Tabulate the **recall**, **precision** and **f-measure**
|
||||
for each value in this confusion matrix.
|
||||
|
||||
>>> reference = "DET NN VB DET JJ NN NN IN DET NN".split()
|
||||
>>> test = "DET VB VB DET NN NN NN IN DET NN".split()
|
||||
>>> cm = ConfusionMatrix(reference, test)
|
||||
>>> print(cm.evaluate())
|
||||
Tag | Prec. | Recall | F-measure
|
||||
----+--------+--------+-----------
|
||||
DET | 1.0000 | 1.0000 | 1.0000
|
||||
IN | 1.0000 | 1.0000 | 1.0000
|
||||
JJ | 0.0000 | 0.0000 | 0.0000
|
||||
NN | 0.7500 | 0.7500 | 0.7500
|
||||
VB | 0.5000 | 1.0000 | 0.6667
|
||||
<BLANKLINE>
|
||||
|
||||
:param alpha: Ratio of the cost of false negative compared to false
|
||||
positives, as used in the f-measure computation. Defaults to 0.5,
|
||||
where the costs are equal.
|
||||
:type alpha: float
|
||||
:param truncate: If specified, then only show the specified
|
||||
number of values. Any sorting (e.g., sort_by_count)
|
||||
will be performed before truncation. Defaults to None
|
||||
:type truncate: int, optional
|
||||
:param sort_by_count: Whether to sort the outputs on frequency
|
||||
in the reference label. Defaults to False.
|
||||
:type sort_by_count: bool, optional
|
||||
:return: A tabulated recall, precision and f-measure string
|
||||
:rtype: str
|
||||
"""
|
||||
tags = self._values
|
||||
|
||||
# Apply keyword parameters
|
||||
if sort_by_count:
|
||||
tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]]))
|
||||
if truncate:
|
||||
tags = tags[:truncate]
|
||||
|
||||
tag_column_len = max(max(len(tag) for tag in tags), 3)
|
||||
|
||||
# Construct the header
|
||||
s = (
|
||||
f"{' ' * (tag_column_len - 3)}Tag | Prec. | Recall | F-measure\n"
|
||||
f"{'-' * tag_column_len}-+--------+--------+-----------\n"
|
||||
)
|
||||
|
||||
# Construct the body
|
||||
for tag in tags:
|
||||
s += (
|
||||
f"{tag:>{tag_column_len}} | "
|
||||
f"{self.precision(tag):<6.4f} | "
|
||||
f"{self.recall(tag):<6.4f} | "
|
||||
f"{self.f_measure(tag, alpha=alpha):.4f}\n"
|
||||
)
|
||||
|
||||
return s
|
||||
|
||||
|
||||
def demo():
    """Print a small tagging example: the inputs, the confusion matrix in
    default and count-sorted order, and the recall for the tag ``VB``."""
    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
    test = "DET VB VB DET NN NN NN IN DET NN".split()

    print("Reference =", reference)
    print("Test =", test)
    print("Confusion matrix:")

    matrix = ConfusionMatrix(reference, test)
    print(matrix)
    print(matrix.pretty_format(sort_by_count=True))

    print(matrix.recall("VB"))


if __name__ == "__main__":
    demo()
|
||||
508
backend/venv/Lib/site-packages/nltk/metrics/distance.py
Normal file
508
backend/venv/Lib/site-packages/nltk/metrics/distance.py
Normal file
@@ -0,0 +1,508 @@
|
||||
# Natural Language Toolkit: Distance Metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Tom Lippincott <tom@cs.columbia.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
Distance Metrics.
|
||||
|
||||
Compute the distance between two items (usually strings).
|
||||
As metrics, they must satisfy the following three requirements:
|
||||
|
||||
1. d(a, a) = 0
|
||||
2. d(a, b) >= 0
|
||||
3. d(a, c) <= d(a, b) + d(b, c)
|
||||
"""
|
||||
|
||||
import operator
|
||||
import warnings
|
||||
|
||||
|
||||
def _edit_dist_init(len1, len2):
|
||||
lev = []
|
||||
for i in range(len1):
|
||||
lev.append([0] * len2) # initialize 2D array to zero
|
||||
for i in range(len1):
|
||||
lev[i][0] = i # column 0: 0,1,2,3,4,...
|
||||
for j in range(len2):
|
||||
lev[0][j] = j # row 0: 0,1,2,3,4,...
|
||||
return lev
|
||||
|
||||
|
||||
def _last_left_t_init(sigma):
|
||||
return {c: 0 for c in sigma}
|
||||
|
||||
|
||||
def _edit_dist_step(
|
||||
lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False
|
||||
):
|
||||
c1 = s1[i - 1]
|
||||
c2 = s2[j - 1]
|
||||
|
||||
# skipping a character in s1
|
||||
a = lev[i - 1][j] + 1
|
||||
# skipping a character in s2
|
||||
b = lev[i][j - 1] + 1
|
||||
# substitution
|
||||
c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0)
|
||||
|
||||
# transposition
|
||||
d = c + 1 # never picked by default
|
||||
if transpositions and last_left > 0 and last_right > 0:
|
||||
d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1
|
||||
|
||||
# pick the cheapest
|
||||
lev[i][j] = min(a, b, c, d)
|
||||
|
||||
|
||||
def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
    """
    Calculate the Levenshtein edit-distance between two strings.
    The edit distance is the number of characters that need to be
    substituted, inserted, or deleted, to transform s1 into s2.  For
    example, transforming "rain" to "shine" requires three steps,
    consisting of two substitutions and one insertion:
    "rain" -> "sain" -> "shin" -> "shine".  These operations could have
    been done in other orders, but at least three steps are needed.

    Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
    because sometimes it makes sense to assign greater penalties to
    substitutions.

    This also optionally allows transposition edits (e.g., "ab" -> "ba"),
    though this is disabled by default.

    :param s1, s2: The strings to be analysed
    :param transpositions: Whether to allow transposition edits
    :type s1: str
    :type s2: str
    :type substitution_cost: int
    :type transpositions: bool
    :rtype: int
    """
    # set up a 2-D array
    rows, cols = len(s1), len(s2)
    lev = _edit_dist_init(rows + 1, cols + 1)

    # table remembering, for every symbol of the joint alphabet, the
    # position of its last seen occurrence in s1
    last_left_t = _last_left_t_init(set(s1) | set(s2))

    # iterate over the array
    # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code
    # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    for i, c1 in enumerate(s1, start=1):
        last_right_buf = 0
        for j, c2 in enumerate(s2, start=1):
            last_left = last_left_t[c2]
            last_right = last_right_buf
            if c1 == c2:
                last_right_buf = j
            _edit_dist_step(
                lev,
                i,
                j,
                s1,
                s2,
                last_left,
                last_right,
                substitution_cost=substitution_cost,
                transpositions=transpositions,
            )
        last_left_t[c1] = i
    return lev[rows][cols]
|
||||
|
||||
|
||||
def _edit_dist_backtrace(lev):
|
||||
i, j = len(lev) - 1, len(lev[0]) - 1
|
||||
alignment = [(i, j)]
|
||||
|
||||
while (i, j) != (0, 0):
|
||||
directions = [
|
||||
(i - 1, j - 1), # substitution
|
||||
(i - 1, j), # skip s1
|
||||
(i, j - 1), # skip s2
|
||||
]
|
||||
|
||||
direction_costs = (
|
||||
(lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
|
||||
for i, j in directions
|
||||
)
|
||||
_, (i, j) = min(direction_costs, key=operator.itemgetter(0))
|
||||
|
||||
alignment.append((i, j))
|
||||
return list(reversed(alignment))
|
||||
|
||||
|
||||
def edit_distance_align(s1, s2, substitution_cost=1):
    """
    Calculate the minimum Levenshtein edit-distance based alignment
    mapping between two strings. The alignment finds the mapping
    from string s1 to s2 that minimizes the edit distance cost.
    For example, mapping "rain" to "shine" would involve 2
    substitutions, 2 matches and an insertion resulting in
    the following mapping:
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
    NB: (0, 0) is the start state without any letters associated
    See more: https://web.stanford.edu/class/cs124/lec/med.pdf

    In case of multiple valid minimum-distance alignments, the
    backtrace has the following operation precedence:

    1. Substitute s1 and s2 characters
    2. Skip s1 character
    3. Skip s2 character

    The backtrace is carried out in reverse string order.

    This function does not support transposition.

    :param s1, s2: The strings to be aligned
    :type s1: str
    :type s2: str
    :type substitution_cost: int
    :rtype: List[Tuple(int, int)]
    """
    # set up a 2-D array
    rows, cols = len(s1), len(s2)
    lev = _edit_dist_init(rows + 1, cols + 1)

    # fill every cell, row by row; transpositions are never considered
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            _edit_dist_step(
                lev,
                i,
                j,
                s1,
                s2,
                0,
                0,
                substitution_cost=substitution_cost,
                transpositions=False,
            )

    # backtrace to find alignment
    return _edit_dist_backtrace(lev)
|
||||
|
||||
|
||||
def binary_distance(label1, label2):
    """Simple equality test.

    0.0 if the labels are identical, 1.0 if they are different.

    >>> from nltk.metrics import binary_distance
    >>> binary_distance(1,1)
    0.0

    >>> binary_distance(1,3)
    1.0
    """
    if label1 == label2:
        return 0.0
    return 1.0
|
||||
|
||||
|
||||
def jaccard_distance(label1, label2):
    """Distance metric comparing set-similarity.

    Computed as ``(|union| - |intersection|) / |union|``.

    :param label1, label2: the sets to compare (any objects providing
        ``union`` and ``intersection``)
    :return: a value between 0.0 (identical sets) and 1.0 (disjoint sets);
        two empty sets are identical, so their distance is 0.0
    :rtype: float
    """
    union_size = len(label1.union(label2))
    if union_size == 0:
        # Both sets are empty: avoid dividing by zero and honour d(a, a) = 0.
        return 0.0
    return (union_size - len(label1.intersection(label2))) / union_size
|
||||
|
||||
|
||||
def masi_distance(label1, label2):
    """Distance metric that takes into account partial agreement when multiple
    labels are assigned.

    >>> from nltk.metrics import masi_distance
    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
    0.665

    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
    for Semantic and Pragmatic Annotation.

    :param label1, label2: the label sets to compare
    :return: ``1 - |intersection| / |union| * m`` where the weight ``m`` is
        1 for identical sets, 0.67 when one set contains the other, 0.33
        when they merely overlap and 0 when they are disjoint; two empty
        sets are identical, so their distance is 0.0
    :rtype: float
    """
    len_intersection = len(label1.intersection(label2))
    len_union = len(label1.union(label2))
    if len_union == 0:
        # Both sets are empty: avoid dividing by zero and honour d(a, a) = 0.
        return 0.0

    len_label1 = len(label1)
    len_label2 = len(label2)
    if len_label1 == len_label2 and len_label1 == len_intersection:
        m = 1  # identical sets
    elif len_intersection == min(len_label1, len_label2):
        m = 0.67  # one set is a subset of the other
    elif len_intersection > 0:
        m = 0.33  # overlapping, but neither contains the other
    else:
        m = 0  # disjoint

    return 1 - len_intersection / len_union * m
|
||||
|
||||
|
||||
def interval_distance(label1, label2):
    """Krippendorff's interval distance metric

    >>> from nltk.metrics import interval_distance
    >>> interval_distance(1,10)
    81

    Krippendorff 1980, Content Analysis: An Introduction to its Methodology

    :param label1, label2: numeric labels
    :return: the squared difference of the labels, or None (after printing
        a message) when the labels do not support the arithmetic
    """
    try:
        return pow(label1 - label2, 2)
    except Exception:
        # Deliberately best-effort: report and fall through to None.  Unlike
        # a bare ``except``, this lets KeyboardInterrupt and SystemExit
        # propagate.
        print("non-numeric labels not supported with interval distance")
        return None
|
||||
|
||||
|
||||
def presence(label):
    """Higher-order function to test presence of a given label

    The returned two-argument function yields 1.0 when ``label`` is in both
    of its arguments or in neither of them, and 0.0 otherwise.
    """
    return lambda x, y: float((label in x) == (label in y))
|
||||
|
||||
|
||||
def fractional_presence(label):
    """Higher-order function returning a two-argument distance based on the
    share ``1 / len(...)`` that ``label`` takes in each argument.

    The returned function yields the absolute difference of the two shares
    when ``label`` is in both arguments, the share in the one argument that
    contains it when it is in exactly one, and 0.0 when it is in neither.
    """
    return lambda x, y: _fractional_presence_of(label, x, y)


def _fractional_presence_of(label, x, y):
    """Evaluate the fractional-presence distance of ``label`` for ``x``, ``y``."""
    # Both shares are always computed first, so an empty argument raises
    # ZeroDivisionError regardless of where the label occurs.
    share_x = 1.0 / len(x)
    share_y = 1.0 / len(y)
    in_x = label in x
    in_y = label in y

    if in_x and in_y:
        return abs(share_x - share_y)
    if in_x:
        return share_x
    if in_y:
        return share_y
    return 0.0
|
||||
|
||||
|
||||
def custom_distance(file):
    """Build a distance function from a tab-separated distance table.

    Each non-blank line of ``file`` must hold three tab-separated fields:
    two labels and the distance between them.  Blank lines are ignored.

    :param file: path of the distance table
    :return: a function ``f(x, y)`` returning the stored distance, where
        ``x`` and ``y`` are single-label frozensets such as
        ``frozenset(["A"])``; the order of the arguments does not matter
        and an unlisted pair raises ``KeyError``
    :raise ValueError: if a non-blank line does not have exactly three
        fields or its distance is not a number
    """
    data = {}
    with open(file) as infile:
        for line in infile:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            labelA, labelB, dist = record.split("\t")
            labelA = frozenset([labelA])
            labelB = frozenset([labelB])
            # Key on the unordered pair so lookups are symmetric.
            data[frozenset([labelA, labelB])] = float(dist)
    return lambda x, y: data[frozenset([x, y])]
|
||||
|
||||
|
||||
def jaro_similarity(s1, s2):
    """
    Computes the Jaro similarity between 2 sequences from:

        Matthew A. Jaro (1989). Advances in record linkage methodology
        as applied to the 1985 census of Tampa Florida. Journal of the
        American Statistical Association. 84 (406): 414-20.

    The Jaro distance between two words is the minimum number of
    single-character transpositions required to change one word into the
    other. The Jaro similarity formula from
    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :

        ``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/|s_2| + (m-t)/m)``

    where
        - `|s_i|` is the length of string `s_i`
        - `m` is the no. of matching characters
        - `t` is the half no. of possible transpositions.

    :param s1, s2: the sequences to compare
    :return: 0 when no characters match, otherwise the similarity as a float
    """
    # First, store the length of the strings
    # because they will be re-used several times.
    len_s1, len_s2 = len(s1), len(s2)

    # The upper bound of the distance for being a matched character.
    # Clamped at 0: for one-character inputs the raw value is -1, which
    # would leave an empty search window and score identical strings as 0.
    match_bound = max(max(len_s1, len_s2) // 2 - 1, 0)

    flagged_1 = []  # positions in s1 which are matches to some character in s2
    taken_2 = [False] * len_s2  # whether each position of s2 is matched yet

    # Iterate through sequences and pair each character of s1 with the
    # first unmatched equal character of s2 inside the match window.
    for i, char in enumerate(s1):
        lowerbound = max(0, i - match_bound)
        upperbound = min(i + match_bound, len_s2 - 1)
        for j in range(lowerbound, upperbound + 1):
            if not taken_2[j] and char == s2[j]:
                taken_2[j] = True
                flagged_1.append(i)
                break

    matches = len(flagged_1)  # no. of matched characters in s1 and s2
    if matches == 0:
        return 0

    # Matched positions of s2 in ascending order.
    flagged_2 = [j for j, taken in enumerate(taken_2) if taken]
    # Matched characters that disagree when both sides are read in order.
    transpositions = sum(1 for i, j in zip(flagged_1, flagged_2) if s1[i] != s2[j])

    return (
        1
        / 3
        * (
            matches / len_s1
            + matches / len_s2
            + (matches - transpositions // 2) / matches
        )
    )
|
||||
|
||||
|
||||
def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
    """
    The Jaro Winkler distance is an extension of the Jaro similarity in:

        William E. Winkler. 1990. String Comparator Metrics and Enhanced
        Decision Rules in the Fellegi-Sunter Model of Record Linkage.
        Proceedings of the Section on Survey Research Methods.
        American Statistical Association: 354-359.

    such that:

        jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )

    where,

    - jaro_sim is the output from the Jaro Similarity,
      see jaro_similarity()
    - l is the length of common prefix at the start of the string
        - this implementation provides an upperbound for the l value
          to keep the prefixes. A common value of this upperbound is 4.
    - p is the constant scaling factor to overweigh common prefixes.
      The Jaro-Winkler similarity will fall within the [0, 1] bound,
      given that max(p)<=0.25 , default is p=0.1 in Winkler (1990)

    :param s1, s2: the sequences to compare
    :param p: scaling factor applied to the common prefix
    :param max_l: upper bound on the counted common-prefix length
    :return: the Jaro-Winkler similarity

    Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
    from "Table 5 Comparison of String Comparators Rescaled between 0 and 1"

    >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
    ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
    ... ("dixon", "dickson"), ("billy", "susan")]

    >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
    >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]

    One way to match the values on the Winkler's paper is to provide a different
    p scaling factor for different pairs of strings, e.g.

    >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]

    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore


    Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
    "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names"

    >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
    ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
    ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
    ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
    ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
    ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
    ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
    ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]

    >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
    ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
    ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]

    >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
    ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
    ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]

    One way to match the values on the Winkler's paper is to provide a different
    p scaling factor for different pairs of strings, e.g.

    >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
    ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
    ...     if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
    ...         continue  # Skip bad examples from the paper.
    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore



    This test-case proves that the output of Jaro-Winkler similarity depends on
    the product  l * p and not on the product max_l * p. Here the product max_l * p > 1
    however the product l * p <= 1

    >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
    0.88
    """
    # To ensure that the output of the Jaro-Winkler's similarity
    # falls between [0,1], the product of l * p needs to
    # also fall between [0,1].
    if not 0 <= max_l * p <= 1:
        warnings.warn(
            "The product `max_l * p` might not fall between [0,1]. "
            "Jaro-Winkler similarity might not be between 0 and 1."
        )

    # Compute the Jaro similarity
    jaro_sim = jaro_similarity(s1, s2)

    # Compute the prefix matches, counting at most ``max_l`` characters.
    # The cap is tested before counting so that ``max_l <= 0`` yields 0.
    l = 0
    # zip() will automatically loop until the end of shorter string.
    for s1_i, s2_i in zip(s1, s2):
        if l >= max_l or s1_i != s2_i:
            break
        l += 1

    # Return the similarity value as described in docstring.
    return jaro_sim + (l * p * (1 - jaro_sim))
|
||||
|
||||
|
||||
def demo():
    """Print string-distance scores for sample word pairs, then set distances."""
    string_distance_examples = [
        ("rain", "shine"),
        ("abcdef", "acbdef"),
        ("language", "lnaguaeg"),
        ("language", "lnaugage"),
        ("language", "lngauage"),
    ]
    for s1, s2 in string_distance_examples:
        # Shared label so every report line names the same pair.
        pair = f"'{s1}' and '{s2}'"
        print(f"Edit distance btwn {pair}:", edit_distance(s1, s2))
        print(
            f"Edit dist with transpositions btwn {pair}:",
            edit_distance(s1, s2, transpositions=True),
        )
        print(f"Jaro similarity btwn {pair}:", jaro_similarity(s1, s2))
        print(
            f"Jaro-Winkler similarity btwn {pair}:",
            jaro_winkler_similarity(s1, s2),
        )
        print(
            f"Jaro-Winkler distance btwn {pair}:",
            1 - jaro_winkler_similarity(s1, s2),
        )

    # Set-based distances on two small overlapping sets.
    s1 = {1, 2, 3, 4}
    s2 = {3, 4, 5}
    print("s1:", s1)
    print("s2:", s2)
    print("Binary distance:", binary_distance(s1, s2))
    print("Jaccard distance:", jaccard_distance(s1, s2))
    print("MASI distance:", masi_distance(s1, s2))


if __name__ == "__main__":
    demo()
|
||||
389
backend/venv/Lib/site-packages/nltk/metrics/paice.py
Normal file
389
backend/venv/Lib/site-packages/nltk/metrics/paice.py
Normal file
@@ -0,0 +1,389 @@
|
||||
# Natural Language Toolkit: Agreement Metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Lauri Hallila <laurihallila@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""Counts Paice's performance statistics for evaluating stemming algorithms.
|
||||
|
||||
What is required:
|
||||
- A dictionary of words grouped by their real lemmas
|
||||
- A dictionary of words grouped by stems from a stemming algorithm
|
||||
|
||||
When these are given, Understemming Index (UI), Overstemming Index (OI),
|
||||
Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted.
|
||||
|
||||
References:
|
||||
Chris D. Paice (1994). An evaluation method for stemming algorithms.
|
||||
In Proceedings of SIGIR, 42--50.
|
||||
"""
|
||||
|
||||
from math import sqrt
|
||||
|
||||
|
||||
def get_words_from_dictionary(lemmas):
    """
    Collect every word that appears under any lemma.

    :param lemmas: A dictionary where keys are lemmas and values are sets
        or lists of words corresponding to that lemma.
    :type lemmas: dict(str): list(str)
    :return: Union of all the word groups stored as values
    :rtype: set(str)
    """
    collected = set()
    for group in lemmas.values():
        collected |= set(group)
    return collected
|
||||
|
||||
|
||||
def _truncate(words, cutlength):
|
||||
"""Group words by stems defined by truncating them at given length.
|
||||
|
||||
:param words: Set of words used for analysis
|
||||
:param cutlength: Words are stemmed by cutting at this length.
|
||||
:type words: set(str) or list(str)
|
||||
:type cutlength: int
|
||||
:return: Dictionary where keys are stems and values are sets of words
|
||||
corresponding to that stem.
|
||||
:rtype: dict(str): set(str)
|
||||
"""
|
||||
stems = {}
|
||||
for word in words:
|
||||
stem = word[:cutlength]
|
||||
try:
|
||||
stems[stem].update([word])
|
||||
except KeyError:
|
||||
stems[stem] = {word}
|
||||
return stems
|
||||
|
||||
|
||||
# Reference: https://en.wikipedia.org/wiki/Line-line_intersection
|
||||
def _count_intersection(l1, l2):
|
||||
"""Count intersection between two line segments defined by coordinate pairs.
|
||||
|
||||
:param l1: Tuple of two coordinate pairs defining the first line segment
|
||||
:param l2: Tuple of two coordinate pairs defining the second line segment
|
||||
:type l1: tuple(float, float)
|
||||
:type l2: tuple(float, float)
|
||||
:return: Coordinates of the intersection
|
||||
:rtype: tuple(float, float)
|
||||
"""
|
||||
x1, y1 = l1[0]
|
||||
x2, y2 = l1[1]
|
||||
x3, y3 = l2[0]
|
||||
x4, y4 = l2[1]
|
||||
|
||||
denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
|
||||
|
||||
if denominator == 0.0: # lines are parallel
|
||||
if x1 == x2 == x3 == x4 == 0.0:
|
||||
# When lines are parallel, they must be on the y-axis.
|
||||
# We can ignore x-axis because we stop counting the
|
||||
# truncation line when we get there.
|
||||
# There are no other options as UI (x-axis) grows and
|
||||
# OI (y-axis) diminishes when we go along the truncation line.
|
||||
return (0.0, y4)
|
||||
|
||||
x = (
|
||||
(x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)
|
||||
) / denominator
|
||||
y = (
|
||||
(x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)
|
||||
) / denominator
|
||||
return (x, y)
|
||||
|
||||
|
||||
def _get_derivative(coordinates):
|
||||
"""Get derivative of the line from (0,0) to given coordinates.
|
||||
|
||||
:param coordinates: A coordinate pair
|
||||
:type coordinates: tuple(float, float)
|
||||
:return: Derivative; inf if x is zero
|
||||
:rtype: float
|
||||
"""
|
||||
try:
|
||||
return coordinates[1] / coordinates[0]
|
||||
except ZeroDivisionError:
|
||||
return float("inf")
|
||||
|
||||
|
||||
def _calculate_cut(lemmawords, stems):
|
||||
"""Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
|
||||
|
||||
:param lemmawords: Set or list of words corresponding to certain lemma.
|
||||
:param stems: A dictionary where keys are stems and values are sets
|
||||
or lists of words corresponding to that stem.
|
||||
:type lemmawords: set(str) or list(str)
|
||||
:type stems: dict(str): set(str)
|
||||
:return: Amount of understemmed and overstemmed pairs contributed by words
|
||||
existing in both lemmawords and stems.
|
||||
:rtype: tuple(float, float)
|
||||
"""
|
||||
umt, wmt = 0.0, 0.0
|
||||
for stem in stems:
|
||||
cut = set(lemmawords) & set(stems[stem])
|
||||
if cut:
|
||||
cutcount = len(cut)
|
||||
stemcount = len(stems[stem])
|
||||
# Unachieved merge total
|
||||
umt += cutcount * (len(lemmawords) - cutcount)
|
||||
# Wrongly merged total
|
||||
wmt += cutcount * (stemcount - cutcount)
|
||||
return (umt, wmt)
|
||||
|
||||
|
||||
def _calculate(lemmas, stems):
    """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.

    :param lemmas: A dictionary where keys are lemmas and values are sets
        or lists of words corresponding to that lemma.
    :param stems: A dictionary where keys are stems and values are sets
        or lists of words corresponding to that stem.
    :type lemmas: dict(str): list(str)
    :type stems: dict(str): set(str)
    :return: Global unachieved merge total (gumt),
        global desired merge total (gdmt),
        global wrongly merged total (gwmt) and
        global desired non-merge total (gdnt).
    :rtype: tuple(float, float, float, float)
    """
    # Total number of words over all lemma groups.
    total_words = sum(len(group) for group in lemmas.values())

    gdmt = gdnt = gumt = gwmt = 0.0
    for group in lemmas.values():
        size = len(group)

        # Desired merge total: ordered pairs inside the same lemma group.
        gdmt += size * (size - 1)

        # Desired non-merge total: ordered pairs across lemma groups.
        gdnt += size * (total_words - size)

        # Understemmed / overstemmed pairs for this lemma against all stems.
        umt, wmt = _calculate_cut(group, stems)
        gumt += umt
        gwmt += wmt

    # Each pair was counted twice, so halve every total.
    return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2)
|
||||
|
||||
|
||||
def _indexes(gumt, gdmt, gwmt, gdnt):
|
||||
"""Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
|
||||
|
||||
:param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
|
||||
global desired merge total (gdmt),
|
||||
global wrongly merged total (gwmt) and
|
||||
global desired non-merge total (gdnt).
|
||||
:type gumt, gdmt, gwmt, gdnt: float
|
||||
:return: Understemming Index (UI),
|
||||
Overstemming Index (OI) and
|
||||
Stemming Weight (SW).
|
||||
:rtype: tuple(float, float, float)
|
||||
"""
|
||||
# Calculate Understemming Index (UI),
|
||||
# Overstemming Index (OI) and Stemming Weight (SW)
|
||||
try:
|
||||
ui = gumt / gdmt
|
||||
except ZeroDivisionError:
|
||||
# If GDMT (max merge total) is 0, define UI as 0
|
||||
ui = 0.0
|
||||
try:
|
||||
oi = gwmt / gdnt
|
||||
except ZeroDivisionError:
|
||||
# IF GDNT (max non-merge total) is 0, define OI as 0
|
||||
oi = 0.0
|
||||
try:
|
||||
sw = oi / ui
|
||||
except ZeroDivisionError:
|
||||
if oi == 0.0:
|
||||
# OI and UI are 0, define SW as 'not a number'
|
||||
sw = float("nan")
|
||||
else:
|
||||
# UI is 0, define SW as infinity
|
||||
sw = float("inf")
|
||||
return (ui, oi, sw)
|
||||
|
||||
|
||||
class Paice:
    """Class for storing lemmas, stems and evaluation metrics."""

    def __init__(self, lemmas, stems):
        """
        :param lemmas: A dictionary where keys are lemmas and values are sets
            or lists of words corresponding to that lemma.
        :param stems: A dictionary where keys are stems and values are sets
            or lists of words corresponding to that stem.
        :type lemmas: dict(str): list(str)
        :type stems: dict(str): set(str)
        """
        self.lemmas = lemmas
        self.stems = stems
        self.coords = []
        # Placeholders; update() below fills in the real values.
        self.gumt = self.gdmt = self.gwmt = self.gdnt = None
        self.ui = self.oi = self.sw = None
        self.errt = None
        self.update()

    def __str__(self):
        coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
        report = [
            "Global Unachieved Merge Total (GUMT): %s\n" % self.gumt,
            "Global Desired Merge Total (GDMT): %s\n" % self.gdmt,
            "Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt,
            "Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt,
            "Understemming Index (GUMT / GDMT): %s\n" % self.ui,
            "Overstemming Index (GWMT / GDNT): %s\n" % self.oi,
            "Stemming Weight (OI / UI): %s\n" % self.sw,
            "Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt,
            "Truncation line: %s" % coordinates,
        ]
        return "".join(report)

    def _get_truncation_indexes(self, words, cutlength):
        """Count (UI, OI) when stemming is done by truncating words at 'cutlength'.

        :param words: Words used for the analysis
        :param cutlength: Words are stemmed by cutting them at this length
        :type words: set(str) or list(str)
        :type cutlength: int
        :return: Understemming and overstemming indexes
        :rtype: tuple(float, float)
        """
        truncated = _truncate(words, cutlength)
        totals = _calculate(self.lemmas, truncated)
        ui, oi, _ = _indexes(*totals)
        return (ui, oi)

    def _get_truncation_coordinates(self, cutlength=0):
        """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.

        :param cutlength: Optional parameter to start counting from (ui, oi)
            coordinates gotten by stemming at this length. Useful for speeding up
            the calculations when you know the approximate location of the
            intersection.
        :type cutlength: int
        :return: List of coordinate pairs that define the truncation line
        :rtype: list(tuple(float, float))
        """
        words = get_words_from_dictionary(self.lemmas)
        maxlength = max(len(word) for word in words)

        # Truncate at increasing lengths until the (0, 0) - (ui, oi) segment
        # crosses the truncation line.
        coords = []
        while cutlength <= maxlength:
            pair = self._get_truncation_indexes(words, cutlength)

            # Keep only distinct points so consecutive entries always form
            # a real line segment.
            if pair not in coords:
                coords.append(pair)

            if pair == (0.0, 0.0):
                # Truncation line passes through the origin; its distance
                # from the origin is 0, so stop here.
                break

            if len(coords) >= 2 and pair[0] > 0.0:
                previous_slope = _get_derivative(coords[-2])
                current_slope = _get_derivative(coords[-1])
                # The slope decreases along the truncation line; once it
                # passes the Stemming Weight the crossing segment is found.
                if previous_slope >= self.sw >= current_slope:
                    break

            cutlength += 1
        return coords

    def _errt(self):
        """Count Error-Rate Relative to Truncation (ERRT).

        :return: ERRT, length of the line from origo to (UI, OI) divided by
            the length of the line from origo to the point defined by the same
            line when extended until the truncation line.
        :rtype: float
        """
        # Collect truncation-line points up to the segment crossed by (ui, oi).
        self.coords = self._get_truncation_coordinates()
        at_origin = (self.ui, self.oi) == (0.0, 0.0)

        if (0.0, 0.0) in self.coords:
            # Truncation line goes through the origin, so ERRT cannot be counted.
            return float("nan") if at_origin else float("inf")

        if at_origin:
            # (ui, oi) is the origin; define errt as 0.0
            return 0.0

        # Here (ui, oi) is not the origin and coords holds distinct points,
        # so both arguments are real segments.
        intersection = _count_intersection(
            ((0, 0), (self.ui, self.oi)), self.coords[-2:]
        )
        # OP: distance from the origin to (ui, oi).
        op = sqrt(self.ui**2 + self.oi**2)
        # OT: distance from the origin to the truncation line along the same ray.
        ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2)
        # OP / OT compares the stemmer against plain truncation.
        return op / ot

    def update(self):
        """Update statistics after lemmas and stems have been set."""
        self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
        self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
        self.errt = self._errt()
|
||||
|
||||
|
||||
def demo():
    """Demonstration of the module."""

    def show(groups):
        # Print each key with its space-separated words in sorted key order,
        # followed by a blank line.
        for key in sorted(groups):
            print("{} => {}".format(key, " ".join(groups[key])))
        print()

    # Some words with their real lemmas
    lemmas = {
        "kneel": ["kneel", "knelt"],
        "range": ["range", "ranged"],
        "ring": ["ring", "rang", "rung"],
    }
    # Same words with stems from a stemming algorithm
    stems = {
        "kneel": ["kneel"],
        "knelt": ["knelt"],
        "rang": ["rang", "range", "ranged"],
        "ring": ["ring"],
        "rung": ["rung"],
    }
    print("Words grouped by their lemmas:")
    show(lemmas)
    print("Same words grouped by a stemming algorithm:")
    show(stems)
    p = Paice(lemmas, stems)
    print(p)
    print()

    # Let's "change" results from a stemming algorithm
    stems = {
        "kneel": ["kneel"],
        "knelt": ["knelt"],
        "rang": ["rang"],
        "range": ["range", "ranged"],
        "ring": ["ring"],
        "rung": ["rung"],
    }
    print("Counting stats after changing stemming results:")
    show(stems)
    p.stems = stems
    p.update()
    print(p)


if __name__ == "__main__":
    demo()
|
||||
228
backend/venv/Lib/site-packages/nltk/metrics/scores.py
Normal file
228
backend/venv/Lib/site-packages/nltk/metrics/scores.py
Normal file
@@ -0,0 +1,228 @@
|
||||
# Natural Language Toolkit: Evaluation
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import operator
|
||||
from functools import reduce
|
||||
from math import fabs
|
||||
from random import shuffle
|
||||
|
||||
try:
|
||||
from scipy.stats.stats import betai
|
||||
except ImportError:
|
||||
betai = None
|
||||
|
||||
from nltk.util import LazyConcatenation, LazyMap
|
||||
|
||||
|
||||
def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal. In particular, return the fraction of indices
    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.

    :type reference: list
    :param reference: An ordered list of reference values.
    :type test: list
    :param test: A list of values to compare against the corresponding
        reference values.
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    agreements = [expected == observed for expected, observed in zip(reference, test)]
    return sum(agreements) / len(test)
|
||||
|
||||
|
||||
def precision(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of test values that appear in the reference set.
    In particular, return card(``reference`` intersection ``test``)/card(``test``).
    If ``test`` is empty, then return None.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    :raise TypeError: If either argument has no ``intersection`` attribute.
    """
    for candidate in (reference, test):
        if not hasattr(candidate, "intersection"):
            raise TypeError("reference and test should be sets")

    if len(test) == 0:
        return None
    return len(reference.intersection(test)) / len(test)
|
||||
|
||||
|
||||
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return card(``reference`` intersection ``test``)/card(``reference``).
    If ``reference`` is empty, then return None.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    :raise TypeError: If either argument has no ``intersection`` attribute.
    """
    for candidate in (reference, test):
        if not hasattr(candidate, "intersection"):
            raise TypeError("reference and test should be sets")

    if len(reference) == 0:
        return None
    return len(reference.intersection(test)) / len(reference)
|
||||
|
||||
|
||||
def f_measure(reference, test, alpha=0.5):
    """
    Given a set of reference values and a set of test values, return
    the f-measure of the test values, when compared against the
    reference values. The f-measure is the harmonic mean of the
    ``precision`` and ``recall``, weighted by ``alpha``. In particular,
    given the precision *p* and recall *r* defined by:

    - *p* = card(``reference`` intersection ``test``)/card(``test``)
    - *r* = card(``reference`` intersection ``test``)/card(``reference``)

    The f-measure is:

    - *1/(alpha/p + (1-alpha)/r)*

    If either ``reference`` or ``test`` is empty, then ``f_measure``
    returns None.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    """
    prec = precision(reference, test)
    rec = recall(reference, test)

    if prec is None or rec is None:
        return None
    if prec == 0 or rec == 0:
        # No overlap at all; avoids dividing by zero below.
        return 0

    weighted_inverse = alpha / prec + (1 - alpha) / rec
    return 1.0 / weighted_inverse
|
||||
|
||||
|
||||
def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values, given the probability distributions.

    :param reference: A list of reference values
    :type reference: list
    :param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    :type test: list(ProbDistI)
    :raise ValueError: If the two lists differ in length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    # Mean of dist.logprob(val) over the aligned (value, distribution) pairs.
    logprobs = [dist.logprob(val) for val, dist in zip(reference, test)]
    return sum(logprobs) / len(reference)
|
||||
|
||||
|
||||
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    Recognised keyword arguments are ``shuffles`` (default 999),
    ``statistic`` (default: arithmetic mean) and ``verbose`` (default False).

    :return: a tuple containing an approximate significance level, the count
        of the number of times the pseudo-statistic varied from the
        actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    size_a = len(a)
    total = size_a + len(b)

    # there's no point in trying to shuffle beyond all possible permutations
    permutations = reduce(operator.mul, range(1, total + 1))
    shuffles = min(kwargs.get("shuffles", 999), permutations)
    stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
    verbose = kwargs.get("verbose", False)

    if verbose:
        print("shuffles: %d" % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print("actual statistic: %f" % actual_stat)
        print("-" * 60)

    # Counter of shuffles whose statistic is at least the observed one;
    # it starts at a tiny positive value rather than zero.
    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = list(range(total))

    for i in range(shuffles):
        report = verbose and i % 10 == 0
        if report:
            print("shuffle: %d" % i)

        shuffle(indices)

        # Split the shuffled index list at len(a) to form two pseudo-samples.
        pseudo_stat_a = stat(LazyMap(lambda idx: lst[idx], indices[:size_a]))
        pseudo_stat_b = stat(LazyMap(lambda idx: lst[idx], indices[size_a:]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if report:
            print("pseudo-statistic: %f" % pseudo_stat)
            print("significance: %f" % ((c + 1) / (i + 1)))
            print("-" * 60)

    significance = (c + 1) / (shuffles + 1)

    if verbose:
        print("significance: %f" % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}")

    return (significance, c, shuffles)
|
||||
|
||||
|
||||
def demo():
    """Print accuracy, precision, recall and f-measure for a small tagging example."""
    rule = "-" * 75

    print(rule)
    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
    test = "DET VB VB DET NN NN NN IN DET NN".split()
    print("Reference =", reference)
    print("Test =", test)
    print("Accuracy:", accuracy(reference, test))

    print(rule)
    # The set-based scores ignore order and duplicates.
    reference_set = set(reference)
    test_set = set(test)
    print("Reference =", reference_set)
    print("Test = ", test_set)
    print("Precision:", precision(reference_set, test_set))
    print(" Recall:", recall(reference_set, test_set))
    print("F-Measure:", f_measure(reference_set, test_set))
    print(rule)


if __name__ == "__main__":
    demo()
|
||||
222
backend/venv/Lib/site-packages/nltk/metrics/segmentation.py
Normal file
222
backend/venv/Lib/site-packages/nltk/metrics/segmentation.py
Normal file
@@ -0,0 +1,222 @@
|
||||
# Natural Language Toolkit: Text Segmentation Metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# David Doukhan <david.doukhan@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
"""
|
||||
Text Segmentation Metrics
|
||||
|
||||
1. Windowdiff
|
||||
|
||||
Pevzner, L., and Hearst, M., A Critique and Improvement of
|
||||
an Evaluation Metric for Text Segmentation,
|
||||
Computational Linguistics 28, 19-36
|
||||
|
||||
|
||||
2. Generalized Hamming Distance
|
||||
|
||||
Bookstein A., Kulyukin V.A., Raita T.
|
||||
Generalized Hamming Distance
|
||||
Information Retrieval 5, 2002, pp 353-375
|
||||
|
||||
Baseline implementation in C++
|
||||
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
|
||||
|
||||
Study describing benefits of Generalized Hamming Distance Versus
|
||||
WindowDiff for evaluating text segmentation tasks
|
||||
Bestgen, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
|
||||
TALN 2009
|
||||
|
||||
|
||||
3. Pk text segmentation metric
|
||||
|
||||
Beeferman D., Berger A., Lafferty J. (1999)
|
||||
Statistical Models for Text Segmentation
|
||||
Machine Learning, 34, 177-210
|
||||
"""
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

        >>> s1 = "000100000010"
        >>> s2 = "000010000100"
        >>> s3 = "100000010000"
        >>> '%.2f' % windowdiff(s1, s1, 3)
        '0.00'
        >>> '%.2f' % windowdiff(s1, s2, 3)
        '0.30'
        >>> '%.2f' % windowdiff(s2, s3, 3)
        '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    :raise ValueError: If the segmentations differ in length or ``k``
        exceeds their length.
    """
    if len(seg1) != len(seg2):
        raise ValueError("Segmentations have unequal length")
    if k > len(seg1):
        raise ValueError(
            "Window width k should be smaller or equal than segmentation lengths"
        )

    n_windows = len(seg1) - k + 1
    wd = 0
    for start in range(n_windows):
        stop = start + k
        ndiff = abs(seg1[start:stop].count(boundary) - seg2[start:stop].count(boundary))
        # Weighted variant adds the full difference; the plain variant only
        # records whether the window counts disagree.
        wd += ndiff if weighted else min(1, ndiff)
    return wd / float(n_windows)
|
||||
|
||||
|
||||
# Generalized Hamming Distance
|
||||
|
||||
|
||||
def _init_mat(nrows, ncols, ins_cost, del_cost):
|
||||
mat = np.empty((nrows, ncols))
|
||||
mat[0, :] = ins_cost * np.arange(ncols)
|
||||
mat[:, 0] = del_cost * np.arange(nrows)
|
||||
return mat
|
||||
|
||||
|
||||
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
|
||||
for i, rowi in enumerate(rowv):
|
||||
for j, colj in enumerate(colv):
|
||||
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
|
||||
if rowi == colj:
|
||||
# boundaries are at the same location, no transformation required
|
||||
tcost = mat[i, j]
|
||||
elif rowi > colj:
|
||||
# boundary match through a deletion
|
||||
tcost = del_cost + mat[i, j + 1]
|
||||
else:
|
||||
# boundary match through an insertion
|
||||
tcost = ins_cost + mat[i + 1, j]
|
||||
mat[i + 1, j + 1] = min(tcost, shift_cost)
|
||||
|
||||
|
||||
def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
    """
    Compute the Generalized Hamming Distance for a reference and a hypothetical
    segmentation, corresponding to the cost related to the transformation
    of the hypothetical segmentation into the reference segmentation
    through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    Recommended parameter values are a shift_cost_coeff of 2.
    Associated with a ins_cost, and del_cost equal to the mean segment
    length in the reference segmentation.

        >>> # Same examples as Kulyukin C++ implementation
        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
        0.5
        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
        2.0
        >>> ghd('011', '110', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('1', '0', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('111', '000', 1.0, 1.0, 0.5)
        3.0
        >>> ghd('000', '111', 1.0, 2.0, 0.5)
        6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
        ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
        are the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """
    # Positions of the boundary markers in each segmentation.
    ref_idx = [pos for pos, val in enumerate(ref) if val == boundary]
    hyp_idx = [pos for pos, val in enumerate(hyp) if val == boundary]

    if not hyp_idx:
        # Nothing to shift or delete: every reference boundary is inserted.
        return len(ref_idx) * ins_cost if ref_idx else 0.0
    if not ref_idx:
        # Every hypothesised boundary has to be deleted.
        return len(hyp_idx) * del_cost

    mat = _init_mat(len(hyp_idx) + 1, len(ref_idx) + 1, ins_cost, del_cost)
    _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
    return float(mat[-1, -1])
|
||||
|
||||
|
||||
# Beeferman's Pk text segmentation evaluation metric
|
||||
|
||||
|
||||
def pk(ref, hyp, k=None, boundary="1"):
    """
    Compute the Pk metric for a pair of segmentations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to mark
    the edge of a segmentation.

    A window of ``k`` items is slid over both segmentations.  A window
    counts as an error when exactly one of the two segmentations has at
    least one boundary inside it.  The result is the number of such
    windows divided by the total number of windows.

        >>> '%.2f' % pk('0100'*100, '1'*400, 2)
        '0.50'
        >>> '%.2f' % pk('0100'*100, '0'*400, 2)
        '0.50'
        >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
        '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    :raises ZeroDivisionError: if ``k`` is None and ``ref`` contains no
        boundary, or if ``k == len(ref) + 1`` (there is no window to score)
    """

    if k is None:
        # Half of the mean segment length, the mean being estimated as
        # len(ref) divided by the number of boundaries in ref.
        k = int(round(len(ref) / (ref.count(boundary) * 2.0)))

    window_count = len(ref) - k + 1
    # One error per window position where ref and hyp disagree on whether
    # the window contains a boundary.
    err = sum(
        1
        for start in range(window_count)
        if (ref[start : start + k].count(boundary) > 0)
        != (hyp[start : start + k].count(boundary) > 0)
    )
    return err / (len(ref) - k + 1.0)
|
||||
68
backend/venv/Lib/site-packages/nltk/metrics/spearman.py
Normal file
68
backend/venv/Lib/site-packages/nltk/metrics/spearman.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# Natural Language Toolkit: Spearman Rank Correlation
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Tools for comparing ranked lists.
|
||||
"""
|
||||
|
||||
|
||||
def _rank_dists(ranks1, ranks2):
|
||||
"""Finds the difference between the values in ranks1 and ranks2 for keys
|
||||
present in both dicts. If the arguments are not dicts, they are converted
|
||||
from (key, rank) sequences.
|
||||
"""
|
||||
ranks1 = dict(ranks1)
|
||||
ranks2 = dict(ranks2)
|
||||
for k in ranks1:
|
||||
try:
|
||||
yield k, ranks1[k] - ranks2[k]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
|
||||
def spearman_correlation(ranks1, ranks2):
    """Returns the Spearman correlation coefficient for two rankings, which
    should be dicts or sequences of (key, rank). The coefficient ranges from
    -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
    calculated for keys in both rankings (for meaningful results, remove keys
    present in only one list before ranking).

    The value is computed as ``1 - 6 * sum(d ** 2) / (n * (n ** 2 - 1))``,
    where ``d`` is the rank difference of a shared key and ``n`` is the
    number of shared keys. When that denominator is zero (no shared key, or
    exactly one), 0.0 is returned.
    """
    squared_diffs = [diff * diff for _key, diff in _rank_dists(ranks1, ranks2)]
    n = len(squared_diffs)
    total = sum(squared_diffs)
    try:
        return 1 - (6 * total / (n * (n * n - 1)))
    except ZeroDivisionError:
        # Result is undefined if fewer than two items are ranked
        return 0.0
|
||||
|
||||
|
||||
def ranks_from_sequence(seq):
    """Given a sequence, yields each element with an increasing rank, suitable
    for use as an argument to ``spearman_correlation``.

    The rank of an element is its zero-based position in ``seq``.
    """
    positioned = enumerate(seq)
    return ((item, rank) for rank, item in positioned)
|
||||
|
||||
|
||||
def ranks_from_scores(scores, rank_gap=1e-15):
    """Given a sequence of (key, score) tuples, yields each key with an
    increasing rank, tying with previous key's rank if the difference between
    their scores is not greater than rank_gap. Suitable for use as an
    argument to ``spearman_correlation``.

    Ranks follow the order in which the pairs are supplied: a key that is not
    tied with its predecessor receives its zero-based position as rank.
    """
    rank = 0
    prev_score = None
    for position, (key, score) in enumerate(scores):
        try:
            starts_new_rank = bool(abs(score - prev_score) > rank_gap)
        except TypeError:
            # prev_score is None for the first pair, so the subtraction
            # raises TypeError for numeric scores; the current rank is kept.
            starts_new_rank = False
        if starts_new_rank:
            rank = position

        yield key, rank
        prev_score = score
|
||||
Reference in New Issue
Block a user