Initial commit

2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions
--- a/backend/venv/Lib/site-packages/nltk/metrics/__init__.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/__init__.py
@@ -0,0 +1,51 @@
+# Natural Language Toolkit: Metrics
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Steven Bird <stevenbird1@gmail.com>
+#         Edward Loper <edloper@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+NLTK Metrics
+
+Classes and methods for scoring processing modules.
+"""
+
+from nltk.metrics.agreement import AnnotationTask
+from nltk.metrics.aline import align
+from nltk.metrics.association import (
+    BigramAssocMeasures,
+    ContingencyMeasures,
+    NgramAssocMeasures,
+    QuadgramAssocMeasures,
+    TrigramAssocMeasures,
+)
+from nltk.metrics.confusionmatrix import ConfusionMatrix
+from nltk.metrics.distance import (
+    binary_distance,
+    custom_distance,
+    edit_distance,
+    edit_distance_align,
+    fractional_presence,
+    interval_distance,
+    jaccard_distance,
+    masi_distance,
+    presence,
+)
+from nltk.metrics.paice import Paice
+from nltk.metrics.scores import (
+    accuracy,
+    approxrand,
+    f_measure,
+    log_likelihood,
+    precision,
+    recall,
+)
+from nltk.metrics.segmentation import ghd, pk, windowdiff
+from nltk.metrics.spearman import (
+    ranks_from_scores,
+    ranks_from_sequence,
+    spearman_correlation,
+)
--- a/backend/venv/Lib/site-packages/nltk/metrics/agreement.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/agreement.py
@@ -0,0 +1,467 @@
+# Natural Language Toolkit: Agreement Metrics
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Tom Lippincott <tom@cs.columbia.edu>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Implementations of inter-annotator agreement coefficients surveyed by Artstein
+and Poesio (2007), Inter-Coder Agreement for Computational Linguistics.
+
+An agreement coefficient calculates the amount that annotators agreed on label
+assignments beyond what is expected by chance.
+
+In defining the AnnotationTask class, we use naming conventions similar to the
+paper's terminology.  There are three types of objects in an annotation task:
+
+    the coders (variables "c" and "C")
+    the items to be annotated (variables "i" and "I")
+    the potential categories to be assigned (variables "k" and "K")
+
+Additionally, it is often the case that we don't want to treat two different
+labels as complete disagreement, and so the AnnotationTask constructor can also
+take a distance metric as a final argument.  Distance metrics are simply
+functions that take two arguments, and return a value between 0.0 and 1.0
+indicating the distance between them.  If not supplied, the default is binary
+comparison between the arguments.
+
+The simplest way to initialize an AnnotationTask is with a list of triples,
+each containing a coder's assignment for one object in the task:
+
+    task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...])
+
+Note that the data list needs to contain the same number of triples for each
+individual coder, containing category values for the same set of items.
+
+Alpha (Krippendorff 1980)
+Kappa (Cohen 1960)
+S (Bennett, Albert and Goldstein 1954)
+Pi (Scott 1955)
+
+
+TODO: Describe handling of multiple coders and missing data
+
+Expected results from the Artstein and Poesio survey paper:
+
+    >>> from nltk.metrics.agreement import AnnotationTask
+    >>> import os.path
+    >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
+    >>> t.avg_Ao()
+    0.88
+    >>> round(t.pi(), 5)
+    0.79953
+    >>> round(t.S(), 2)
+    0.82
+
+    This would have returned a wrong value (0.0) in @785fb79 as coders are in
+    the wrong order. Subsequently, all values for pi(), S(), and kappa() would
+    have been wrong as they are computed with avg_Ao().
+    >>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')])
+    >>> t2.avg_Ao()
+    1.0
+
+    The following, of course, also works.
+    >>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')])
+    >>> t3.avg_Ao()
+    1.0
+
+"""
+
+import logging
+from itertools import groupby
+from operator import itemgetter
+
+from nltk.internals import deprecated
+from nltk.metrics.distance import binary_distance
+from nltk.probability import ConditionalFreqDist, FreqDist
+
+log = logging.getLogger(__name__)
+
+
+class AnnotationTask:
+    """Represents an annotation task, i.e. people assign labels to items.
+
+    Notation tries to match notation in Artstein and Poesio (2007).
+
+    In general, coders and items can be represented as any hashable object.
+    Integers, for example, are fine, though strings are more readable.
+    Labels must support the distance functions applied to them, so e.g.
+    a string-edit-distance makes no sense if your labels are integers,
+    whereas interval distance needs numeric values.  A notable case of this
+    is the MASI metric, which requires Python sets.
+
+    Attributes:
+        C: set of all coders seen in the loaded data.
+        I: set of all items seen in the loaded data.
+        K: set of all labels seen in the loaded data.
+        data: list of ``{"coder": ..., "labels": ..., "item": ...}`` dicts,
+            one per loaded triple.
+        distance: the label distance function.
+    """
+
+    def __init__(self, data=None, distance=binary_distance):
+        """Initialize an annotation task.
+
+        The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
+        each representing a coder's labeling of an item:
+        ``(coder,item,label)``
+
+        The distance argument is a function taking two arguments (labels) and producing a numerical distance.
+        The distance from a label to itself should be zero:
+        ``distance(l,l) = 0``
+        """
+        self.distance = distance
+        self.I = set()
+        self.K = set()
+        self.C = set()
+        self.data = []
+        if data is not None:
+            self.load_array(data)
+
+    def __str__(self):
+        """Render the loaded records, one per line (lines joined by CRLF).
+
+        Each line holds the coder, the item with ``_`` replaced by a tab, and
+        the comma-joined labels, separated by tabs.  This therefore requires
+        items to be strings and labels to be iterables of strings.
+        """
+        return "\r\n".join(
+            map(
+                lambda x: "%s\t%s\t%s"
+                % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
+                self.data,
+            )
+        )
+
+    def load_array(self, array):
+        """Load a sequence of annotation results, appending to any data already loaded.
+
+        The argument is a sequence of 3-tuples, each representing a coder's labeling of an item:
+            (coder,item,label)
+        """
+        for coder, item, labels in array:
+            self.C.add(coder)
+            self.K.add(labels)
+            self.I.add(item)
+            self.data.append({"coder": coder, "labels": labels, "item": item})
+
+    def agr(self, cA, cB, i, data=None):
+        """Agreement between two coders on a given item.
+
+        Returns ``1.0 - distance(label_A, label_B)`` for the labels coders
+        ``cA`` and ``cB`` assigned to item ``i``.  ``data`` optionally
+        restricts the records that are searched; when it is ``None`` (or
+        otherwise falsy) all of ``self.data`` is searched.  ``StopIteration``
+        propagates if either coder has no record for the item.
+        """
+        data = data or self.data
+        # cfedermann: we don't know what combination of coder/item will come
+        # first in x; to avoid StopIteration problems due to assuming an order
+        # cA,cB, we allow either for k1 and then look up the missing as k2.
+        k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i)
+        if k1["coder"] == cA:
+            k2 = next(x for x in data if x["coder"] == cB and x["item"] == i)
+        else:
+            k2 = next(x for x in data if x["coder"] == cA and x["item"] == i)
+
+        ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
+        log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
+        log.debug(
+            'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
+        )
+        return ret
+
+    def Nk(self, k):
+        """Number of records (as a float) whose label equals ``k``."""
+        return float(sum(1 for x in self.data if x["labels"] == k))
+
+    def Nik(self, i, k):
+        """Number of records (as a float) for item ``i`` with label ``k``."""
+        return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))
+
+    def Nck(self, c, k):
+        """Number of records (as a float) by coder ``c`` with label ``k``."""
+        return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))
+
+    @deprecated("Use Nk, Nik or Nck instead")
+    def N(self, k=None, i=None, c=None):
+        """Implements the "n-notation" used in Artstein and Poesio (2007)
+
+        Dispatches to ``Nk``, ``Nik`` or ``Nck`` depending on which of ``i``
+        and ``c`` accompany ``k``; any other combination raises ``ValueError``.
+        """
+        if k is not None and i is None and c is None:
+            ret = self.Nk(k)
+        elif k is not None and i is not None and c is None:
+            ret = self.Nik(i, k)
+        elif k is not None and c is not None and i is None:
+            ret = self.Nck(c, k)
+        else:
+            raise ValueError(
+                f"You must pass either i or c, not both! (k={k!r},i={i!r},c={c!r})"
+            )
+        log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
+        return ret
+
+    def _grouped_data(self, field, data=None):
+        """Group records by ``field``.
+
+        Sorts ``data`` (``self.data`` when omitted or falsy) on the given
+        record key and returns the resulting ``itertools.groupby`` iterator.
+        """
+        data = data or self.data
+        return groupby(sorted(data, key=itemgetter(field)), itemgetter(field))
+
+    def Ao(self, cA, cB):
+        """Observed agreement between two coders on all items.
+
+        Sums ``agr`` over the items for which either coder has a record and
+        divides by the total number of items, ``len(self.I)``.
+        """
+        data = self._grouped_data(
+            "item", (x for x in self.data if x["coder"] in (cA, cB))
+        )
+        ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
+            self.I
+        )
+        log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
+        return ret
+
+    def _pairwise_average(self, function):
+        """
+        Calculates the average of function results for each coder pair
+
+        ``function`` is called once per unordered pair of coders as
+        ``function(cA, cB)``.  With fewer than two coders there are no pairs
+        and the final division raises ``ZeroDivisionError``.
+        """
+        total = 0
+        n = 0
+        # Shrinking copy of the coder set, so each pair is visited only once.
+        s = self.C.copy()
+        for cA in self.C:
+            s.remove(cA)
+            for cB in s:
+                total += function(cA, cB)
+                n += 1
+        ret = total / n
+        return ret
+
+    def avg_Ao(self):
+        """Average observed agreement across all coders and items."""
+        ret = self._pairwise_average(self.Ao)
+        log.debug("Average observed agreement: %f", ret)
+        return ret
+
+    def Do_Kw_pairwise(self, cA, cB, max_distance=1.0):
+        """The observed disagreement for the weighted kappa coefficient.
+
+        Sums the distance between the two coders' labels over all items and
+        divides by ``len(self.I) * max_distance``.
+        """
+        total = 0.0
+        data = (x for x in self.data if x["coder"] in (cA, cB))
+        for i, itemdata in self._grouped_data("item", data):
+            # we should have two items; distance doesn't care which comes first
+            total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])
+
+        ret = total / (len(self.I) * max_distance)
+        log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
+        return ret
+
+    def Do_Kw(self, max_distance=1.0):
+        """Averaged over all labelers"""
+        ret = self._pairwise_average(
+            lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)
+        )
+        log.debug("Observed disagreement: %f", ret)
+        return ret
+
+    # Agreement Coefficients
+    def S(self):
+        """Bennett, Albert and Goldstein 1954
+
+        Expected agreement is uniform over the label set: ``1 / len(self.K)``.
+        """
+        Ae = 1.0 / len(self.K)
+        ret = (self.avg_Ao() - Ae) / (1.0 - Ae)
+        return ret
+
+    def pi(self):
+        """Scott 1955; here, multi-pi.
+        Equivalent to K from Siegel and Castellan (1988).
+
+        """
+        total = 0.0
+        label_freqs = FreqDist(x["labels"] for x in self.data)
+        for k, f in label_freqs.items():
+            total += f**2
+        # Sum of squared label counts over the squared number of
+        # (item, coder) slots.
+        Ae = total / ((len(self.I) * len(self.C)) ** 2)
+        return (self.avg_Ao() - Ae) / (1 - Ae)
+
+    def Ae_kappa(self, cA, cB):
+        """Expected agreement between two coders for Cohen's kappa.
+
+        For every label, multiplies the two coders' relative frequencies of
+        that label (count divided by the number of items) and sums the
+        products.
+        """
+        Ae = 0.0
+        nitems = float(len(self.I))
+        label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
+        for k in label_freqs.conditions():
+            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
+        return Ae
+
+    def kappa_pairwise(self, cA, cB):
+        """Kappa for a single pair of coders: ``(Ao - Ae) / (1 - Ae)``."""
+        Ae = self.Ae_kappa(cA, cB)
+        ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae)
+        log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae)
+        return ret
+
+    def kappa(self):
+        """Cohen 1960
+        Averages naively over kappas for each coder pair.
+
+        """
+        return self._pairwise_average(self.kappa_pairwise)
+
+    def multi_kappa(self):
+        """Davies and Fleiss 1982
+        Averages over observed and expected agreements for each coder pair.
+
+        """
+        Ae = self._pairwise_average(self.Ae_kappa)
+        return (self.avg_Ao() - Ae) / (1.0 - Ae)
+
+    def Disagreement(self, label_freqs):
+        """Distance-weighted disagreement within one frequency distribution.
+
+        Sums ``n_j * n_l * distance(l, j)`` over every ordered pair of labels
+        in ``label_freqs`` and divides by ``total * (total - 1)``, where
+        ``total`` is the sum of all counts.
+        """
+        total_labels = sum(label_freqs.values())
+        pairs = 0.0
+        for j, nj in label_freqs.items():
+            for l, nl in label_freqs.items():
+                pairs += float(nj * nl) * self.distance(l, j)
+        return 1.0 * pairs / (total_labels * (total_labels - 1))
+
+    def alpha(self):
+        """Krippendorff 1980
+
+        Items with fewer than two labels are ignored.  Returns 1 when only a
+        single label value is present; raises ``ValueError`` when there is no
+        data or only one coder and one item.
+        """
+        # check for degenerate cases
+        if len(self.K) == 0:
+            raise ValueError("Cannot calculate alpha, no data present!")
+        if len(self.K) == 1:
+            log.debug("Only one annotation value, alpha returning 1.")
+            return 1
+        if len(self.C) == 1 and len(self.I) == 1:
+            raise ValueError("Cannot calculate alpha, only one coder and item present!")
+
+        all_valid_labels_freq = FreqDist([])
+        total_do = 0.0  # Total observed disagreement for all items.
+        for i, itemdata in self._grouped_data("item"):
+            label_freqs = FreqDist(x["labels"] for x in itemdata)
+            labels_count = sum(label_freqs.values())
+            if labels_count < 2:
+                # Ignore the item.
+                continue
+            all_valid_labels_freq += label_freqs
+            total_do += self.Disagreement(label_freqs) * labels_count
+
+        if len(all_valid_labels_freq.keys()) == 1:
+            log.debug("Only one valid annotation value, alpha returning 1.")
+            return 1
+
+        do = total_do / sum(all_valid_labels_freq.values())
+
+        de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
+        k_alpha = 1.0 - do / de
+
+        return k_alpha
+
+    def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
+        """Cohen 1968
+
+        Weighted kappa for one pair of coders, ``1 - Do / De``.  Both the
+        expected (``De``) and the observed (``Do``) disagreement are
+        normalised by ``max_distance``.
+        """
+        total = 0.0
+        label_freqs = ConditionalFreqDist(
+            (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
+        )
+        for j in self.K:
+            for l in self.K:
+                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
+        De = total / (max_distance * pow(len(self.I), 2))
+        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
+        # Do must use the same max_distance as De; otherwise the ratio is
+        # skewed by a factor of max_distance whenever it is not 1.0.
+        Do = self.Do_Kw_pairwise(cA, cB, max_distance)
+        ret = 1.0 - (Do / De)
+        return ret
+
+    def weighted_kappa(self, max_distance=1.0):
+        """Cohen 1968
+
+        Average of ``weighted_kappa_pairwise`` over all coder pairs.
+        """
+        return self._pairwise_average(
+            lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)
+        )
+
+
+# Command-line entry point: read labelings from a file, build an
+# AnnotationTask and print one agreement coefficient.
+if __name__ == "__main__":
+    import optparse
+    import re  # NOTE(review): not referenced anywhere in the script below
+
+    from nltk.metrics import distance
+
+    # process command-line arguments
+    parser = optparse.OptionParser()
+    parser.add_option(
+        "-d",
+        "--distance",
+        dest="distance",
+        default="binary_distance",
+        help="distance metric to use",
+    )
+    parser.add_option(
+        "-a",
+        "--agreement",
+        dest="agreement",
+        default="kappa",
+        help="agreement coefficient to calculate",
+    )
+    parser.add_option(
+        "-e",
+        "--exclude",
+        dest="exclude",
+        action="append",
+        default=[],
+        help="coder names to exclude (may be specified multiple times)",
+    )
+    parser.add_option(
+        "-i",
+        "--include",
+        dest="include",
+        action="append",
+        default=[],
+        help="coder names to include, same format as exclude",
+    )
+    parser.add_option(
+        "-f",
+        "--file",
+        dest="file",
+        help="file to read labelings from, each line with three columns: 'labeler item labels'",
+    )
+    parser.add_option(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        default="0",
+        help="how much debugging to print on stderr (0-4)",
+    )
+    parser.add_option(
+        "-c",
+        "--columnsep",
+        dest="columnsep",
+        default="\t",
+        help="char/string that separates the three columns in the file, defaults to tab",
+    )
+    parser.add_option(
+        "-l",
+        "--labelsep",
+        dest="labelsep",
+        default=",",
+        help="char/string that separates labels (if labelers can assign more than one), defaults to comma",
+    )
+    parser.add_option(
+        "-p",
+        "--presence",
+        dest="presence",
+        default=None,
+        help="convert each labeling into 1 or 0, based on presence of LABEL",
+    )
+    parser.add_option(
+        "-T",
+        "--thorough",
+        dest="thorough",
+        default=False,
+        action="store_true",
+        help="calculate agreement for every subset of the annotators",
+    )
+    (options, remainder) = parser.parse_args()
+
+    # Without an input file there is nothing to do: show usage and stop.
+    if not options.file:
+        parser.print_help()
+        exit()
+
+    # Verbosity 0 maps to level 50; each extra step lowers the threshold by 10.
+    logging.basicConfig(level=50 - 10 * int(options.verbose))
+
+    # read in data from the specified file
+    data = []
+    with open(options.file) as infile:
+        for l in infile:
+            toks = l.split(options.columnsep)
+            # First column is the coder, last column the labels; everything in
+            # between becomes the item key (the stringified list of columns).
+            coder, object_, labels = (
+                toks[0],
+                str(toks[1:-1]),
+                frozenset(toks[-1].strip().split(options.labelsep)),
+            )
+            # Keep the row when the include and exclude lists are equal (e.g.
+            # both empty), when the coder is explicitly included, or when an
+            # exclude list is given and the coder is not on it.
+            if (
+                (options.include == options.exclude)
+                or (len(options.include) > 0 and coder in options.include)
+                or (len(options.exclude) > 0 and coder not in options.exclude)
+            ):
+                data.append((coder, object_, labels))
+
+    if options.presence:
+        # With --presence the named attribute of the distance module is called
+        # with the label, and its result is used as the distance function.
+        task = AnnotationTask(
+            data, getattr(distance, options.distance)(options.presence)
+        )
+    else:
+        task = AnnotationTask(data, getattr(distance, options.distance))
+
+    if options.thorough:
+        # NOTE(review): --thorough is accepted but does nothing here, and no
+        # coefficient is printed in this branch.
+        pass
+    else:
+        # Look up the requested coefficient method by name and print it.
+        print(getattr(task, options.agreement)())
+
+    logging.shutdown()
--- a/backend/venv/Lib/site-packages/nltk/metrics/aline.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/aline.py
--- a/backend/venv/Lib/site-packages/nltk/metrics/association.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/association.py
@@ -0,0 +1,476 @@
+# Natural Language Toolkit: Ngram Association Measures
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Joel Nothman <jnothman@student.usyd.edu.au>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Provides scoring functions for a number of association measures through a
+generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
+``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
+"""
+
+import math as _math
+from abc import ABCMeta, abstractmethod
+from functools import reduce
+
+_log2 = lambda x: _math.log2(x)
+_ln = _math.log
+
+_product = lambda s: reduce(lambda x, y: x * y, s)
+
+_SMALL = 1e-20
+
+try:
+    from scipy.stats import fisher_exact
+except ImportError:
+
+    def fisher_exact(*_args, **_kwargs):
+        raise NotImplementedError
+
+
+### Indices to marginals arguments:
+
+NGRAM = 0
+"""Marginals index for the ngram count"""
+
+UNIGRAMS = -2
+"""Marginals index for a tuple of each unigram count"""
+
+TOTAL = -1
+"""Marginals index for the number of words in the data"""
+
+
+class NgramAssocMeasures(metaclass=ABCMeta):
+    """
+    An abstract class defining a collection of generic association measures.
+    Each public method returns a score, taking the following arguments::
+
+        score_fn(count_of_ngram,
+                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
+                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
+                 ...,
+                 (count_of_1gram_1, ..., count_of_1gram_n),
+                 count_of_total_words)
+
+    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
+
+    Inheriting classes should define a property _n, and a method _contingency
+    which calculates contingency values from marginals in order for all
+    association measures defined here to be usable.
+    """
+
+    # Order of the ngram; subclasses override this (2 for bigrams, ...).
+    _n = 0
+
+    @staticmethod
+    @abstractmethod
+    def _contingency(*marginals):
+        """Calculates values of a contingency table from marginal values.
+
+        :raise NotImplementedError: always, in this generic base class.
+        """
+        raise NotImplementedError(
+            "The contingency table is not available in the general ngram case"
+        )
+
+    @staticmethod
+    @abstractmethod
+    def _marginals(*contingency):
+        """Calculates values of contingency table marginals from its values.
+
+        :raise NotImplementedError: always, in this generic base class.
+        """
+        raise NotImplementedError(
+            "The contingency table is not available in the general ngram case"
+        )
+
+    @classmethod
+    def _expected_values(cls, cont):
+        """Calculates expected values for a contingency table.
+
+        Yields one expected value per cell of ``cont``, in cell order.
+        """
+        n_all = sum(cont)
+        # One bit per ngram position; bit j of a cell index selects which
+        # side of position j that cell lies on.
+        bits = [1 << i for i in range(cls._n)]
+
+        # For each contingency table cell
+        for i in range(len(cont)):
+            # Yield the expected value
+            yield (
+                _product(
+                    # Sum of all cells that agree with cell i on position j.
+                    sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
+                    for j in bits
+                )
+                / (n_all ** (cls._n - 1))
+            )
+
+    @staticmethod
+    def raw_freq(*marginals):
+        """Scores ngrams by their frequency"""
+        return marginals[NGRAM] / marginals[TOTAL]
+
+    @classmethod
+    def student_t(cls, *marginals):
+        """Scores ngrams using Student's t test with independence hypothesis
+        for unigrams, as in Manning and Schutze 5.3.1.
+        """
+        # _SMALL keeps the denominator non-zero when the ngram count is 0.
+        return (
+            marginals[NGRAM]
+            - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
+        ) / (marginals[NGRAM] + _SMALL) ** 0.5
+
+    @classmethod
+    def chi_sq(cls, *marginals):
+        """Scores ngrams using Pearson's chi-square as in Manning and Schutze
+        5.3.3.
+        """
+        cont = cls._contingency(*marginals)
+        exps = cls._expected_values(cont)
+        return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
+
+    @staticmethod
+    def mi_like(*marginals, **kwargs):
+        """Scores ngrams using a variant of mutual information. The keyword
+        argument power sets an exponent (default 3) for the numerator. No
+        logarithm of the result is calculated.
+        """
+        return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
+            marginals[UNIGRAMS]
+        )
+
+    @classmethod
+    def pmi(cls, *marginals):
+        """Scores ngrams by pointwise mutual information, as in Manning and
+        Schutze 5.4.
+        """
+        return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
+            _product(marginals[UNIGRAMS])
+        )
+
+    @classmethod
+    def likelihood_ratio(cls, *marginals):
+        """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
+        cont = cls._contingency(*marginals)
+        # _SMALL guards both the division and the logarithm against zeros.
+        return 2 * sum(
+            obs * _ln(obs / (exp + _SMALL) + _SMALL)
+            for obs, exp in zip(cont, cls._expected_values(cont))
+        )
+
+    @classmethod
+    def poisson_stirling(cls, *marginals):
+        """Scores ngrams using the Poisson-Stirling measure."""
+        exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
+        return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
+
+    @classmethod
+    def jaccard(cls, *marginals):
+        """Scores ngrams using the Jaccard index."""
+        cont = cls._contingency(*marginals)
+        # The last cell (no word of the ngram present) is left out.
+        return cont[0] / sum(cont[:-1])
+
+
+class BigramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of bigram association measures. Each association measure
+    is provided as a function with three arguments::
+
+        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+
+    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
+    - n_ix counts ``(w1, *)``
+    - n_xi counts ``(*, w2)``
+    - n_xx counts ``(*, *)``, i.e. any bigram
+
+    This may be shown with respect to a contingency table::
+
+                w1    ~w1
+             ------ ------
+         w2 | n_ii | n_oi | = n_xi
+             ------ ------
+        ~w2 | n_io | n_oo |
+             ------ ------
+             = n_ix        TOTAL = n_xx
+    """
+
+    _n = 2
+
+    @staticmethod
+    def _contingency(n_ii, n_ix_xi_tuple, n_xx):
+        """Calculates values of a bigram contingency table from marginal values.
+
+        The cells are returned in the order ``(n_ii, n_oi, n_io, n_oo)``.
+        """
+        (n_ix, n_xi) = n_ix_xi_tuple
+        n_oi = n_xi - n_ii
+        n_io = n_ix - n_ii
+        return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
+
+    @staticmethod
+    def _marginals(n_ii, n_oi, n_io, n_oo):
+        """Calculates values of contingency table marginals from its values.
+
+        Inverse of ``_contingency``: returns ``(n_ii, (n_ix, n_xi), n_xx)``.
+        """
+        # n_ix = n_ii + n_io and n_xi = n_ii + n_oi (see the class table), so
+        # that _marginals(*_contingency(*m)) reproduces m.
+        return (n_ii, (n_io + n_ii, n_oi + n_ii), n_oo + n_oi + n_io + n_ii)
+
+    @staticmethod
+    def _expected_values(cont):
+        """Calculates expected values for a contingency table."""
+        n_xx = sum(cont)
+        # For each contingency table cell
+        for i in range(4):
+            # i ^ 1 and i ^ 2 are the cell's neighbours along each axis, so
+            # the two sums are the row and column totals of cell i.
+            yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
+
+    @classmethod
+    def phi_sq(cls, *marginals):
+        """Scores bigrams using phi-square, the square of the Pearson correlation
+        coefficient.
+        """
+        # Unpack in the order _contingency actually returns the cells.
+        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)
+
+        return (n_ii * n_oo - n_oi * n_io) ** 2 / (
+            (n_ii + n_oi) * (n_ii + n_io) * (n_oi + n_oo) * (n_io + n_oo)
+        )
+
+    @classmethod
+    def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
+        """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
+        of bigrams, as in Manning and Schutze 5.3.3.
+        """
+        (n_ix, n_xi) = n_ix_xi_tuple
+        return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
+
+    @classmethod
+    def fisher(cls, *marginals):
+        """Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
+        sensitive to small counts than PMI or Chi Sq, but also more expensive
+        to compute. Requires scipy.
+
+        Returns the p-value of the one-sided ("less") test.
+        """
+
+        # Unpack in the order _contingency actually returns the cells.
+        n_ii, n_oi, n_io, n_oo = cls._contingency(*marginals)
+
+        (odds, pvalue) = fisher_exact([[n_ii, n_oi], [n_io, n_oo]], alternative="less")
+        return pvalue
+
+    @staticmethod
+    def dice(n_ii, n_ix_xi_tuple, n_xx):
+        """Scores bigrams using Dice's coefficient."""
+        (n_ix, n_xi) = n_ix_xi_tuple
+        return 2 * n_ii / (n_ix + n_xi)
+
+
+class TrigramAssocMeasures(NgramAssocMeasures):
+    """
+    Association measures for trigrams. Every measure is a function taking
+    four arguments::
+
+        trigram_score_fn(n_iii,
+                         (n_iix, n_ixi, n_xii),
+                         (n_ixx, n_xix, n_xxi),
+                         n_xxx)
+
+    These are the marginals of a contingency table of corpus counts. In each
+    suffix, the letter i marks a position holding the word in question and
+    x marks a position holding any word. For example:
+
+    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
+    - n_ixx counts ``(w1, *, *)``
+    - n_xxx counts ``(*, *, *)``, i.e. any trigram
+    """
+
+    _n = 3
+
+    @staticmethod
+    def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
+        """Derive the eight cells of the trigram contingency cube from the
+        marginal counts.
+        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
+        (1, 0, 0, 0, 0, 72, 0, 1927)
+        """
+        n_iix, n_ixi, n_xii = n_iix_tuple
+        n_ixx, n_xix, n_xxi = n_ixx_tuple
+
+        # Cells where exactly one position holds some other word.
+        n_oii = n_xii - n_iii
+        n_ioi = n_ixi - n_iii
+        n_iio = n_iix - n_iii
+
+        # Cells where exactly two positions hold some other word.
+        n_ooi = n_xxi - n_iii - n_oii - n_ioi
+        n_oio = n_xix - n_iii - n_oii - n_iio
+        n_ioo = n_ixx - n_iii - n_ioi - n_iio
+
+        # Whatever remains of the total is the all-other cell.
+        n_ooo = n_xxx
+        for cell in (n_iii, n_oii, n_ioi, n_iio, n_ooi, n_oio, n_ioo):
+            n_ooo = n_ooo - cell
+
+        return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
+
+    @staticmethod
+    def _marginals(*contingency):
+        """Rebuild the marginal counts from the eight contingency cells.
+        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
+        (1, (1, 1, 1), (1, 73, 1), 2000)
+        """
+        (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo) = contingency
+
+        # (n_iix, n_ixi, n_xii)
+        two_fixed = (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii)
+        # (n_ixx, n_xix, n_xxi)
+        one_fixed = (
+            n_iii + n_ioi + n_iio + n_ioo,
+            n_iii + n_oii + n_iio + n_oio,
+            n_iii + n_oii + n_ioi + n_ooi,
+        )
+        return (n_iii, two_fixed, one_fixed, sum(contingency))
+
+
+class QuadgramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of quadgram association measures. Each association measure
+    is provided as a function with five arguments::
+
+        quadgram_score_fn(n_iiii,
+                          (n_iiix, n_iixi, n_ixii, n_xiii),
+                          (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+                          (n_ixxx, n_xixx, n_xxix, n_xxxi),
+                          n_xxxx)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+
+    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
+    - n_ixxi counts ``(w1, *, *, w4)``
+    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
+    """
+
+    _n = 4
+
+    @staticmethod
+    def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
+        """Calculates values of a quadgram contingency table from
+        marginal values.
+
+        Returns the 16 cells as a tuple, starting with ``n_iiii`` and ending
+        with ``n_oooo`` (o marks a position holding some other word).
+        """
+        (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
+        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
+        (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
+        # Each cell is its marginal minus every already-computed cell that the
+        # marginal also covers.
+        n_oiii = n_xiii - n_iiii
+        n_ioii = n_ixii - n_iiii
+        n_iioi = n_iixi - n_iiii
+        n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
+        n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
+        n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
+        n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
+        n_iiio = n_iiix - n_iiii
+        n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
+        n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
+        n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
+        n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
+        n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
+        n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
+        # The all-other cell is the total minus the fifteen cells above.
+        n_oooo = (
+            n_xxxx
+            - n_iiii
+            - n_oiii
+            - n_ioii
+            - n_iioi
+            - n_ooii
+            - n_oioi
+            - n_iooi
+            - n_oooi
+            - n_iiio
+            - n_oiio
+            - n_ioio
+            - n_ooio
+            - n_iioo
+            - n_oioo
+            - n_iooo
+        )
+
+        return (
+            n_iiii,
+            n_oiii,
+            n_ioii,
+            n_ooii,
+            n_iioi,
+            n_oioi,
+            n_iooi,
+            n_oooi,
+            n_iiio,
+            n_oiio,
+            n_ioio,
+            n_ooio,
+            n_iioo,
+            n_oioo,
+            n_iooo,
+            n_oooo,
+        )
+
+    @staticmethod
+    def _marginals(*contingency):
+        """Calculates values of contingency table marginals from its values.
+        >>> QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
+        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
+        """
+        # Cells arrive in the same order in which _contingency returns them.
+        (
+            n_iiii,
+            n_oiii,
+            n_ioii,
+            n_ooii,
+            n_iioi,
+            n_oioi,
+            n_iooi,
+            n_oooi,
+            n_iiio,
+            n_oiio,
+            n_ioio,
+            n_ooio,
+            n_iioo,
+            n_oioo,
+            n_iooo,
+            n_oooo,
+        ) = contingency
+
+        # Marginals with three fixed words.
+        n_iiix = n_iiii + n_iiio
+        n_iixi = n_iiii + n_iioi
+        n_ixii = n_iiii + n_ioii
+        n_xiii = n_iiii + n_oiii
+
+        # Marginals with two fixed words.
+        n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
+        n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
+        n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
+        n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
+        n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
+        n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
+
+        # Marginals with a single fixed word.
+        n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
+        n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
+        n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
+        n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
+
+        # Total count is the sum of all sixteen cells.
+        n_all = sum(contingency)
+
+        return (
+            n_iiii,
+            (n_iiix, n_iixi, n_ixii, n_xiii),
+            (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
+            (n_ixxx, n_xixx, n_xxix, n_xxxi),
+            n_all,
+        )
+
+
+class ContingencyMeasures:
+    """Wraps NgramAssocMeasures classes such that the arguments of association
+    measures are contingency table values rather than marginals.
+    """
+
+    def __init__(self, measures):
+        """Constructs a ContingencyMeasures given a NgramAssocMeasures class
+        or an instance of one.
+
+        Every attribute of ``measures`` that does not start with ``__`` is
+        copied onto this object; public ones (no leading underscore) are
+        wrapped so that they accept contingency table values.
+        """
+        # Name the wrapper after the wrapped measures class, whether a class
+        # or an instance was passed in.  For a class, ``measures.__class__``
+        # would be its metaclass, which gave names like "ContingencyABCMeta".
+        owner = measures if isinstance(measures, type) else measures.__class__
+        # NOTE: this renames the ContingencyMeasures class itself, so the name
+        # is shared by all instances.
+        self.__class__.__name__ = "Contingency" + owner.__name__
+        for k in dir(measures):
+            if k.startswith("__"):
+                continue
+            v = getattr(measures, k)
+            if not k.startswith("_"):
+                v = self._make_contingency_fn(measures, v)
+            setattr(self, k, v)
+
+    @staticmethod
+    def _make_contingency_fn(measures, old_fn):
+        """From an association measure function, produces a new function which
+        accepts contingency table values as its arguments.
+
+        The new function converts the cells with ``measures._marginals`` and
+        passes the result on to ``old_fn``; it keeps ``old_fn``'s name and
+        docstring.
+        """
+
+        def res(*contingency):
+            return old_fn(*measures._marginals(*contingency))
+
+        res.__doc__ = old_fn.__doc__
+        res.__name__ = old_fn.__name__
+        return res
--- a/backend/venv/Lib/site-packages/nltk/metrics/confusionmatrix.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/confusionmatrix.py
@@ -0,0 +1,351 @@
+# Natural Language Toolkit: Confusion Matrices
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Tom Aarsen <>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+from nltk.probability import FreqDist
+
+
+class ConfusionMatrix:
+    """
+    The confusion matrix between a list of reference values and a
+    corresponding list of test values.  Entry *[r,t]* of this
+    matrix is a count of the number of times that the reference value
+    *r* corresponds to the test value *t*.  E.g.:
+
+        >>> from nltk.metrics import ConfusionMatrix
+        >>> ref  = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+        >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+        >>> cm = ConfusionMatrix(ref, test)
+        >>> print(cm['NN', 'NN'])
+        3
+
+    Note that the diagonal entries *Ri=Tj* of this matrix
+    correspond to correct values; and the off-diagonal entries
+    correspond to incorrect values.
+    """
+
+    def __init__(self, reference, test, sort_by_count=False):
+        """
+        Construct a new confusion matrix from a list of reference
+        values and a corresponding list of test values.
+
+        :type reference: list
+        :param reference: An ordered list of reference values.
+        :type test: list
+        :param test: A list of values to compare against the
+            corresponding reference values.
+        :type sort_by_count: bool
+        :param sort_by_count: If true, order the values by decreasing
+            combined frequency in ``reference`` and ``test``; otherwise
+            use their natural sort order.
+        :raise ValueError: If ``reference`` and ``test`` do not have
+            the same length.
+        """
+        if len(reference) != len(test):
+            raise ValueError("Lists must have the same length.")
+
+        # Get a list of all values.
+        if sort_by_count:
+            ref_fdist = FreqDist(reference)
+            test_fdist = FreqDist(test)
+
+            def key(v):
+                return -(ref_fdist[v] + test_fdist[v])
+
+            values = sorted(set(reference + test), key=key)
+        else:
+            values = sorted(set(reference + test))
+
+        # Construct a value->index dictionary
+        indices = {val: i for (i, val) in enumerate(values)}
+
+        # Make a confusion matrix table.
+        confusion = [[0 for _ in values] for _ in values]
+        max_conf = 0  # Maximum confusion
+        for w, g in zip(reference, test):
+            confusion[indices[w]][indices[g]] += 1
+            max_conf = max(max_conf, confusion[indices[w]][indices[g]])
+
+        #: A list of all values in ``reference`` or ``test``.
+        self._values = values
+        #: A dictionary mapping values in ``self._values`` to their indices.
+        self._indices = indices
+        #: The confusion matrix itself (as a list of lists of counts).
+        self._confusion = confusion
+        #: The greatest count in ``self._confusion`` (used for printing).
+        self._max_conf = max_conf
+        #: The total number of values in the confusion matrix.
+        self._total = len(reference)
+        #: The number of correct (on-diagonal) values in the matrix.
+        self._correct = sum(confusion[i][i] for i in range(len(values)))
+
+    def __getitem__(self, li_lj_tuple):
+        """
+        :return: The number of times that value ``li`` was expected and
+            value ``lj`` was given.
+        :rtype: int
+        """
+        (li, lj) = li_lj_tuple
+        i = self._indices[li]
+        j = self._indices[lj]
+        return self._confusion[i][j]
+
+    def __repr__(self):
+        return f"<ConfusionMatrix: {self._correct}/{self._total} correct>"
+
+    def __str__(self):
+        return self.pretty_format()
+
+    def pretty_format(
+        self,
+        show_percents=False,
+        values_in_chart=True,
+        truncate=None,
+        sort_by_count=False,
+    ):
+        """
+        :return: A multi-line string representation of this confusion matrix.
+        :param show_percents: If true, show each non-zero entry as a
+            percentage of the total number of values instead of a count.
+        :param values_in_chart: If true, label rows and columns with the
+            values themselves; otherwise label them with numbers and
+            append a key mapping the numbers to the values.
+        :type truncate: int
+        :param truncate: If specified, then only show the specified
+            number of values.  Any sorting (e.g., sort_by_count)
+            will be performed before truncation.
+        :param sort_by_count: If true, then sort by the count of each
+            label in the reference data.  I.e., labels that occur more
+            frequently in the reference label will be towards the left
+            edge of the matrix, and labels that occur less frequently
+            will be towards the right edge.
+
+        @todo: add marginals?
+        """
+        confusion = self._confusion
+
+        values = self._values
+        if sort_by_count:
+            values = sorted(
+                values, key=lambda v: -sum(self._confusion[self._indices[v]])
+            )
+
+        if truncate:
+            values = values[:truncate]
+
+        if values_in_chart:
+            # str() rather than "%s" % val, which misfires on tuple labels.
+            value_strings = [str(val) for val in values]
+        else:
+            value_strings = [str(n + 1) for n in range(len(values))]
+
+        # Construct a format string for row values
+        # (default=0 keeps an empty matrix printable).
+        valuelen = max((len(val) for val in value_strings), default=0)
+        value_format = "%" + repr(valuelen) + "s | "
+        # Construct a format string for matrix entries
+        if show_percents:
+            entrylen = 6
+            entry_format = "%5.1f%%"
+            zerostr = "     ."
+        else:
+            entrylen = len(repr(self._max_conf))
+            entry_format = "%" + repr(entrylen) + "d"
+            zerostr = " " * (entrylen - 1) + "."
+
+        # Write the column values (one character of each label per line,
+        # so labels read top to bottom).
+        s = ""
+        for i in range(valuelen):
+            s += (" " * valuelen) + " |"
+            for val in value_strings:
+                if i >= valuelen - len(val):
+                    s += val[i - valuelen + len(val)].rjust(entrylen + 1)
+                else:
+                    s += " " * (entrylen + 1)
+            s += " |\n"
+
+        # Write a dividing line
+        s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+
+        # Write the entries.
+        for val, li in zip(value_strings, values):
+            i = self._indices[li]
+            s += value_format % val
+            for lj in values:
+                j = self._indices[lj]
+                if confusion[i][j] == 0:
+                    s += zerostr
+                elif show_percents:
+                    s += entry_format % (100.0 * confusion[i][j] / self._total)
+                else:
+                    s += entry_format % confusion[i][j]
+                if i == j:
+                    # Mark diagonal (correct) entries with <...>.
+                    prevspace = s.rfind(" ")
+                    s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
+                else:
+                    s += " "
+            s += "|\n"
+
+        # Write a dividing line
+        s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+
+        # Write a key
+        s += "(row = reference; col = test)\n"
+        if not values_in_chart:
+            s += "Value key:\n"
+            for i, value in enumerate(values):
+                s += "%6d: %s\n" % (i + 1, value)
+
+        return s
+
+    def key(self):
+        """
+        :return: A "Value key" listing that maps the (0-based) index of
+            each value to the value itself, one per line.
+        :rtype: str
+        """
+        values = self._values
+        indexlen = len(repr(len(values) - 1))
+        key_format = "  %" + repr(indexlen) + "d: %s\n"
+        lines = [key_format % (i, value) for i, value in enumerate(values)]
+        return "Value key:\n" + "".join(lines)
+
+    def recall(self, value):
+        """Given a value in the confusion matrix, return the recall
+        that corresponds to this value. The recall is defined as:
+
+        - *r* = true positive / (true positive + false negative)
+
+        and can loosely be considered the ratio of how often ``value``
+        was predicted correctly relative to how often ``value`` was
+        the true result.
+
+        :param value: value used in the ConfusionMatrix
+        :return: the recall corresponding to ``value``; 0.0 if ``value``
+            never occurs in the reference data.
+        :rtype: float
+        """
+        # Number of times `value` was correct, and also predicted
+        TP = self[value, value]
+        # Number of times `value` was correct
+        TP_FN = sum(self[value, pred_value] for pred_value in self._values)
+        if TP_FN == 0:
+            return 0.0
+        return TP / TP_FN
+
+    def precision(self, value):
+        """Given a value in the confusion matrix, return the precision
+        that corresponds to this value. The precision is defined as:
+
+        - *p* = true positive / (true positive + false positive)
+
+        and can loosely be considered the ratio of how often ``value``
+        was predicted correctly relative to the number of predictions
+        for ``value``.
+
+        :param value: value used in the ConfusionMatrix
+        :return: the precision corresponding to ``value``; 0.0 if
+            ``value`` was never predicted.
+        :rtype: float
+        """
+        # Number of times `value` was correct, and also predicted
+        TP = self[value, value]
+        # Number of times `value` was predicted
+        TP_FP = sum(self[real_value, value] for real_value in self._values)
+        if TP_FP == 0:
+            return 0.0
+        return TP / TP_FP
+
+    def f_measure(self, value, alpha=0.5):
+        """
+        Given a value used in the confusion matrix, return the f-measure
+        that corresponds to this value. The f-measure is the harmonic mean
+        of the ``precision`` and ``recall``, weighted by ``alpha``.
+        In particular, given the precision *p* and recall *r* defined by:
+
+        - *p* = true positive / (true positive + false positive)
+        - *r* = true positive / (true positive + false negative)
+
+        The f-measure is:
+
+        - *1/(alpha/p + (1-alpha)/r)*
+
+        With ``alpha = 0.5``, this reduces to:
+
+        - *2pr / (p + r)*
+
+        :param value: value used in the ConfusionMatrix
+        :param alpha: Ratio of the cost of false negative compared to false
+            positives. Defaults to 0.5, where the costs are equal.
+        :type alpha: float
+        :return: the F-measure corresponding to ``value``; 0.0 if either
+            the precision or the recall is 0.
+        :rtype: float
+        """
+        p = self.precision(value)
+        r = self.recall(value)
+        if p == 0.0 or r == 0.0:
+            return 0.0
+        return 1.0 / (alpha / p + (1 - alpha) / r)
+
+    def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False):
+        """
+        Tabulate the **recall**, **precision** and **f-measure**
+        for each value in this confusion matrix.
+
+        >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split()
+        >>> test = "DET VB VB DET NN NN NN IN DET NN".split()
+        >>> cm = ConfusionMatrix(reference, test)
+        >>> print(cm.evaluate())
+        Tag | Prec.  | Recall | F-measure
+        ----+--------+--------+-----------
+        DET | 1.0000 | 1.0000 | 1.0000
+         IN | 1.0000 | 1.0000 | 1.0000
+         JJ | 0.0000 | 0.0000 | 0.0000
+         NN | 0.7500 | 0.7500 | 0.7500
+         VB | 0.5000 | 1.0000 | 0.6667
+        <BLANKLINE>
+
+        :param alpha: Ratio of the cost of false negative compared to false
+            positives, as used in the f-measure computation. Defaults to 0.5,
+            where the costs are equal.
+        :type alpha: float
+        :param truncate: If specified, then only show the specified
+            number of values. Any sorting (e.g., sort_by_count)
+            will be performed before truncation. Defaults to None
+        :type truncate: int, optional
+        :param sort_by_count: Whether to sort the outputs on frequency
+            in the reference label. Defaults to False.
+        :type sort_by_count: bool, optional
+        :return: A tabulated recall, precision and f-measure string
+        :rtype: str
+        """
+        tags = self._values
+
+        # Apply keyword parameters
+        if sort_by_count:
+            tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]]))
+        if truncate:
+            tags = tags[:truncate]
+
+        # Labels need not be strings; format them once so that both the
+        # column width and the rows use the same text.  ``default=0`` keeps
+        # an empty matrix from raising.
+        tag_strings = [str(tag) for tag in tags]
+        tag_column_len = max(max((len(text) for text in tag_strings), default=0), 3)
+
+        # Construct the header
+        s = (
+            f"{' ' * (tag_column_len - 3)}Tag | Prec.  | Recall | F-measure\n"
+            f"{'-' * tag_column_len}-+--------+--------+-----------\n"
+        )
+
+        # Construct the body
+        for tag, tag_string in zip(tags, tag_strings):
+            s += (
+                f"{tag_string:>{tag_column_len}} | "
+                f"{self.precision(tag):<6.4f} | "
+                f"{self.recall(tag):<6.4f} | "
+                f"{self.f_measure(tag, alpha=alpha):.4f}\n"
+            )
+
+        return s
+
+
+def demo():
+    """Print a sample confusion matrix in two layouts, then the recall of "VB"."""
+    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
+    test = "DET VB VB DET NN NN NN IN DET NN".split()
+    print("Reference =", reference)
+    print("Test    =", test)
+
+    # The matrix is never mutated after construction, so one instance
+    # serves all three printouts.
+    matrix = ConfusionMatrix(reference, test)
+    print("Confusion matrix:")
+    print(matrix)
+    print(matrix.pretty_format(sort_by_count=True))
+
+    print(matrix.recall("VB"))
+
+
+if __name__ == "__main__":
+    demo()
--- a/backend/venv/Lib/site-packages/nltk/metrics/distance.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/distance.py
@@ -0,0 +1,508 @@
+# Natural Language Toolkit: Distance Metrics
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Tom Lippincott <tom@cs.columbia.edu>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""
+Distance Metrics.
+
+Compute the distance between two items (usually strings).
+As metrics, they must satisfy the following three requirements:
+
+1. d(a, a) = 0
+2. d(a, b) >= 0
+3. d(a, c) <= d(a, b) + d(b, c)
+"""
+
+import operator
+import warnings
+
+
+def _edit_dist_init(len1, len2):
+    """Build a ``len1`` x ``len2`` table of zeros whose first column holds
+    the row index and whose first row holds the column index.
+    """
+    table = [[0] * len2 for _ in range(len1)]
+    for row_index, row in enumerate(table):
+        row[0] = row_index  # column 0: 0,1,2,3,4,...
+    for col_index in range(len2):
+        table[0][col_index] = col_index  # row 0: 0,1,2,3,4,...
+    return table
+
+
+def _last_left_t_init(sigma):
+    """Return a dict mapping every symbol of ``sigma`` to 0."""
+    return dict.fromkeys(sigma, 0)
+
+
+def _edit_dist_step(
+    lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False
+):
+    """Fill in cell ``lev[i][j]`` with the cheapest way of reaching it.
+
+    The candidates are deleting ``s1[i - 1]``, inserting ``s2[j - 1]``,
+    substituting one for the other (free when they are equal) and, when
+    ``transpositions`` is set and both ``last_left`` and ``last_right``
+    are positive, a transposition anchored at those positions.
+    """
+    mismatch = s1[i - 1] != s2[j - 1]
+
+    skip_s1 = lev[i - 1][j] + 1
+    skip_s2 = lev[i][j - 1] + 1
+    substitute = lev[i - 1][j - 1] + (substitution_cost if mismatch else 0)
+
+    if transpositions and last_left > 0 and last_right > 0:
+        transpose = (
+            lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1
+        )
+    else:
+        # One more than the substitution cost, so it can never win.
+        transpose = substitute + 1
+
+    lev[i][j] = min(skip_s1, skip_s2, substitute, transpose)
+
+
+def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
+    """
+    Calculate the Levenshtein edit-distance between two strings.
+    The edit distance is the number of characters that need to be
+    substituted, inserted, or deleted, to transform s1 into s2.  For
+    example, transforming "rain" to "shine" requires three steps,
+    consisting of two substitutions and one insertion:
+    "rain" -> "sain" -> "shin" -> "shine".  These operations could have
+    been done in other orders, but at least three steps are needed.
+
+    The cost of a substitution edit (e.g., "a" -> "b") is configurable,
+    because sometimes it makes sense to penalise substitutions more
+    heavily than insertions and deletions.
+
+    Transposition edits (e.g., "ab" -> "ba") can optionally be allowed
+    as well; they are disabled by default.
+
+    :param s1, s2: The strings to be analysed
+    :param substitution_cost: The cost charged for one substitution
+    :param transpositions: Whether to allow transposition edits
+    :type s1: str
+    :type s2: str
+    :type substitution_cost: int
+    :type transpositions: bool
+    :rtype: int
+    """
+    rows, cols = len(s1), len(s2)
+    lev = _edit_dist_init(rows + 1, cols + 1)
+
+    # For every symbol of either string: the last row of s1 in which it
+    # was seen (0 = not seen yet).
+    last_left_t = _last_left_t_init(set(s1) | set(s2))
+
+    # Indices are 1-based to stay close to the Wikipedia pseudo-code, see
+    # https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
+    for i, left_char in enumerate(s1, start=1):
+        last_match_col = 0  # last column of this row where the symbols matched
+        for j, right_char in enumerate(s2, start=1):
+            last_left = last_left_t[right_char]
+            last_right = last_match_col
+            if left_char == right_char:
+                last_match_col = j
+            _edit_dist_step(
+                lev,
+                i,
+                j,
+                s1,
+                s2,
+                last_left,
+                last_right,
+                substitution_cost=substitution_cost,
+                transpositions=transpositions,
+            )
+        last_left_t[left_char] = i
+    return lev[rows][cols]
+
+
+def _edit_dist_backtrace(lev):
+    """Walk from the bottom-right cell of ``lev`` back to ``(0, 0)``, each
+    time moving to the cheapest of the diagonal, upper and left neighbours
+    (ties resolved in that order), and return the visited cells in
+    start-to-end order.
+    """
+    row, col = len(lev) - 1, len(lev[0]) - 1
+    path = [(row, col)]
+
+    while (row, col) != (0, 0):
+        neighbours = (
+            (row - 1, col - 1),  # substitution
+            (row - 1, col),  # skip s1
+            (row, col - 1),  # skip s2
+        )
+
+        best_cost = None
+        best_cell = None
+        for r, c in neighbours:
+            # Cells outside the table can never be chosen.
+            cost = lev[r][c] if (r >= 0 and c >= 0) else float("inf")
+            if best_cost is None or cost < best_cost:
+                best_cost, best_cell = cost, (r, c)
+
+        row, col = best_cell
+        path.append(best_cell)
+
+    path.reverse()
+    return path
+
+
+def edit_distance_align(s1, s2, substitution_cost=1):
+    """
+    Calculate the minimum Levenshtein edit-distance based alignment
+    mapping between two strings. The alignment finds the mapping
+    from string s1 to s2 that minimizes the edit distance cost.
+    For example, mapping "rain" to "shine" would involve 2
+    substitutions, 2 matches and an insertion resulting in
+    the following mapping:
+    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
+    NB: (0, 0) is the start state without any letters associated
+    See more: https://web.stanford.edu/class/cs124/lec/med.pdf
+
+    When several minimum-distance alignments exist, the backtrace
+    prefers operations in this order:
+
+    1. Substitute s1 and s2 characters
+    2. Skip s1 character
+    3. Skip s2 character
+
+    The backtrace is carried out in reverse string order.
+
+    This function does not support transposition.
+
+    :param s1, s2: The strings to be aligned
+    :param substitution_cost: The cost charged for one substitution
+    :type s1: str
+    :type s2: str
+    :type substitution_cost: int
+    :rtype: List[Tuple(int, int)]
+    """
+    rows, cols = len(s1), len(s2)
+    lev = _edit_dist_init(rows + 1, cols + 1)
+
+    # Fill the cost table row by row; transpositions stay switched off,
+    # so the two "last seen" positions are simply 0.
+    for i in range(1, rows + 1):
+        for j in range(1, cols + 1):
+            _edit_dist_step(
+                lev,
+                i,
+                j,
+                s1,
+                s2,
+                0,
+                0,
+                substitution_cost=substitution_cost,
+                transpositions=False,
+            )
+
+    # The alignment is read back from the filled table.
+    return _edit_dist_backtrace(lev)
+
+
+def binary_distance(label1, label2):
+    """Distance based on plain equality.
+
+    Returns 0.0 when the two labels compare equal and 1.0 otherwise.
+
+    >>> from nltk.metrics import binary_distance
+    >>> binary_distance(1,1)
+    0.0
+
+    >>> binary_distance(1,3)
+    1.0
+    """
+    if label1 == label2:
+        return 0.0
+    return 1.0
+
+
+def jaccard_distance(label1, label2):
+    """Distance metric comparing set-similarity.
+
+    Computed as ``(|union| - |intersection|) / |union|``.  Two empty sets
+    are identical, so their distance is 0.0 (instead of a division by
+    zero).
+
+    :param label1, label2: The sets to compare
+    :type label1: set
+    :type label2: set
+    :rtype: float
+    """
+    union_size = len(label1.union(label2))
+    if union_size == 0:
+        # Both sets are empty: d(a, a) = 0.
+        return 0.0
+    return (union_size - len(label1.intersection(label2))) / union_size
+
+
+def masi_distance(label1, label2):
+    """Distance metric that takes into account partial agreement when multiple
+    labels are assigned.
+
+    >>> from nltk.metrics import masi_distance
+    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
+    0.665
+
+    Two empty sets are identical, so their distance is 0.0 (instead of a
+    division by zero).
+
+    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
+    for Semantic and Pragmatic Annotation.
+    """
+
+    len_intersection = len(label1.intersection(label2))
+    len_union = len(label1.union(label2))
+    if len_union == 0:
+        # Both sets are empty: d(a, a) = 0.
+        return 0.0
+
+    len_label1 = len(label1)
+    len_label2 = len(label2)
+    # Monotonicity weight: 1 for identical sets, 0.67 when one set
+    # contains the other, 0.33 for any other overlap, 0 when disjoint.
+    if len_label1 == len_label2 and len_label1 == len_intersection:
+        m = 1
+    elif len_intersection == min(len_label1, len_label2):
+        m = 0.67
+    elif len_intersection > 0:
+        m = 0.33
+    else:
+        m = 0
+
+    return 1 - len_intersection / len_union * m
+
+
+def interval_distance(label1, label2):
+    """Krippendorff's interval distance metric
+
+    >>> from nltk.metrics import interval_distance
+    >>> interval_distance(1,10)
+    81
+
+    Krippendorff 1980, Content Analysis: An Introduction to its Methodology
+
+    :return: The squared difference of the two labels, or ``None`` (after
+        printing a message) when the labels do not support the arithmetic.
+    """
+
+    try:
+        return pow(label1 - label2, 2)
+    except TypeError:
+        # Only the "labels are not numbers" case is handled here; anything
+        # else (e.g. KeyboardInterrupt) must propagate to the caller.
+        print("non-numeric labels not supported with interval distance")
+        return None
+
+
+def presence(label):
+    """Higher-order function to test presence of a given label.
+
+    The returned function takes two containers and yields 1.0 when
+    ``label`` is in both or in neither of them, and 0.0 otherwise.
+    """
+
+    return lambda x, y: float((label in x) == (label in y))
+
+
+def fractional_presence(label):
+    """Higher-order function weighting the presence of ``label`` by set size.
+
+    The returned function takes two containers ``x`` and ``y`` and yields:
+
+    - ``abs(1/len(x) - 1/len(y))`` when ``label`` is in both,
+    - ``1/len(x)`` when it is only in ``x``,
+    - ``1/len(y)`` when it is only in ``y``,
+    - ``0.0`` when it is in neither.
+
+    Only the length of a container that actually holds ``label`` is used,
+    so an empty container no longer causes a division by zero.
+    """
+
+    def distance(x, y):
+        in_x = label in x
+        in_y = label in y
+        if in_x and in_y:
+            return abs((1.0 / len(x)) - (1.0 / len(y)))
+        if in_x:
+            return 1.0 / len(x)
+        if in_y:
+            return 1.0 / len(y)
+        return 0.0
+
+    return distance
+
+
+def custom_distance(file):
+    """Build a distance function from a tab-separated file.
+
+    Each non-blank line of ``file`` must hold ``labelA<TAB>labelB<TAB>distance``.
+    Blank lines (such as a trailing empty line) are skipped.
+
+    :param file: Path of the file to read
+    :return: A symmetric function ``f(x, y)`` looking up the stored
+        distance; ``x`` and ``y`` are expected to be ``frozenset`` objects
+        holding a single label each, and an unknown pair raises ``KeyError``.
+    :raise ValueError: If a non-blank line does not have exactly three
+        fields or its distance is not a number.
+    """
+    data = {}
+    with open(file) as infile:
+        for line in infile:
+            line = line.strip()
+            if not line:
+                continue
+            labelA, labelB, dist = line.split("\t")
+            # Keys are unordered pairs of singleton label sets, which is
+            # what makes the lookup below symmetric.
+            pair = frozenset([frozenset([labelA]), frozenset([labelB])])
+            data[pair] = float(dist)
+    return lambda x, y: data[frozenset([x, y])]
+
+
+def jaro_similarity(s1, s2):
+    """
+    Computes the Jaro similarity between 2 sequences from:
+
+        Matthew A. Jaro (1989). Advances in record linkage methodology
+        as applied to the 1985 census of Tampa Florida. Journal of the
+        American Statistical Association. 84 (406): 414-20.
+
+    The Jaro distance between two sequences is the min no. of
+    single-character transpositions required to change one word into
+    another. The Jaro similarity formula from
+    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :
+
+        ``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/|s_2| + (m-t)/m)``
+
+    where
+        - `|s_i|` is the length of string `s_i`
+        - `m` is the no. of matching characters
+        - `t` is the half no. of possible transpositions.
+
+    :param s1, s2: The sequences to compare
+    :return: The similarity, or 0 when no characters match
+    """
+    # First, store the length of the strings
+    # because they will be re-used several times.
+    len_s1, len_s2 = len(s1), len(s2)
+
+    # The upper bound of the distance for being a matched character.
+    # Clamped at 0: for sequences of length 1 the raw value is -1, which
+    # would leave an empty search window and make even identical
+    # one-character sequences score 0.
+    match_bound = max(max(len_s1, len_s2) // 2 - 1, 0)
+
+    # Initialize the counts for matches and transpositions.
+    matches = 0  # no.of matched characters in s1 and s2
+    transpositions = 0  # no. of transpositions between s1 and s2
+    flagged_1 = []  # positions in s1 which are matches to some character in s2
+    flagged_2 = []  # positions in s2 which are matches to some character in s1
+
+    # Iterate through sequences, check for matches and compute transpositions.
+    for i in range(len_s1):  # Iterate through each character.
+        upperbound = min(i + match_bound, len_s2 - 1)
+        lowerbound = max(0, i - match_bound)
+        for j in range(lowerbound, upperbound + 1):
+            if s1[i] == s2[j] and j not in flagged_2:
+                matches += 1
+                flagged_1.append(i)
+                flagged_2.append(j)
+                break
+    # Compare the matched characters in sequence order on both sides.
+    flagged_2.sort()
+    for i, j in zip(flagged_1, flagged_2):
+        if s1[i] != s2[j]:
+            transpositions += 1
+
+    if matches == 0:
+        return 0
+    else:
+        return (
+            1
+            / 3
+            * (
+                matches / len_s1
+                + matches / len_s2
+                + (matches - transpositions // 2) / matches
+            )
+        )
+
+
+def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
+    """
+    The Jaro Winkler distance is an extension of the Jaro similarity in:
+
+        William E. Winkler. 1990. String Comparator Metrics and Enhanced
+        Decision Rules in the Fellegi-Sunter Model of Record Linkage.
+        Proceedings of the Section on Survey Research Methods.
+        American Statistical Association: 354-359.
+
+    such that:
+
+        jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )
+
+    where,
+
+    - jaro_sim is the output from the Jaro Similarity,
+        see jaro_similarity()
+    - l is the length of common prefix at the start of the string
+        - this implementation provides an upperbound for the l value
+            to keep the prefixes. A common value of this upperbound is 4.
+    - p is the constant scaling factor to overweigh common prefixes.
+        The Jaro-Winkler similarity will fall within the [0, 1] bound,
+        given that max(p)<=0.25 , default is p=0.1 in Winkler (1990)
+
+
+    Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
+    from "Table 5 Comparison of String Comparators Rescaled between 0 and 1"
+
+    >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
+    ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
+    ... ("dixon", "dickson"), ("billy", "susan")]
+
+    >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
+    >>> jaro_scores =    [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]
+
+    One way to match the values on the Winkler's paper is to provide a different
+    p scaling factor for different pairs of strings, e.g.
+
+    >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]
+
+    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
+    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
+    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
+
+
+    Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
+    "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names"
+
+    >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
+    ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
+    ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
+    ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
+    ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
+    ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
+    ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
+    ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]
+
+    >>> jaro_scores =   [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
+    ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
+    ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]
+
+    >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
+    ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
+    ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]
+
+    One way to match the values on the Winkler's paper is to provide a different
+    p scaling factor for different pairs of strings, e.g.
+
+    >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
+    ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
+
+
+    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
+    ...     if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
+    ...         continue  # Skip bad examples from the paper.
+    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
+    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
+
+
+
+    This test-case proves that the output of Jaro-Winkler similarity depends on
+    the product  l * p and not on the product max_l * p. Here the product max_l * p > 1
+    however the product l * p <= 1
+
+    >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
+    0.88
+    """
+    # To ensure that the output of the Jaro-Winkler's similarity
+    # falls between [0,1], the product of l * p needs to be
+    # also fall between [0,1].
+    if not 0 <= max_l * p <= 1:
+        warnings.warn(
+            "The product  `max_l * p` might not fall between [0,1]. "
+            "Jaro-Winkler similarity might not be between 0 and 1."
+        )
+
+    # Compute the Jaro similarity
+    jaro_sim = jaro_similarity(s1, s2)
+
+    # Compute the length of the common prefix, capped at max_l.  The cap
+    # is tested before counting a character so that max_l=0 yields l=0.
+    l = 0
+    # zip() will automatically loop until the end of shorter string.
+    for s1_i, s2_i in zip(s1, s2):
+        if l >= max_l or s1_i != s2_i:
+            break
+        l += 1
+    # Return the similarity value as described in docstring.
+    return jaro_sim + (l * p * (1 - jaro_sim))
+
+
+def demo():
+    """Print sample string distances and set distances computed by this module."""
+    word_pairs = (
+        ("rain", "shine"),
+        ("abcdef", "acbdef"),
+        ("language", "lnaguaeg"),
+        ("language", "lnaugage"),
+        ("language", "lngauage"),
+    )
+    for s1, s2 in word_pairs:
+        plain = edit_distance(s1, s2)
+        print(f"Edit distance btwn '{s1}' and '{s2}':", plain)
+
+        transposed = edit_distance(s1, s2, transpositions=True)
+        print(f"Edit dist with transpositions btwn '{s1}' and '{s2}':", transposed)
+
+        print(f"Jaro similarity btwn '{s1}' and '{s2}':", jaro_similarity(s1, s2))
+
+        # The similarity is deterministic, so it is computed once and
+        # reused for the distance.
+        winkler = jaro_winkler_similarity(s1, s2)
+        print(f"Jaro-Winkler similarity btwn '{s1}' and '{s2}':", winkler)
+        print(f"Jaro-Winkler distance btwn '{s1}' and '{s2}':", 1 - winkler)
+
+    left = {1, 2, 3, 4}
+    right = {3, 4, 5}
+    print("s1:", left)
+    print("s2:", right)
+    print("Binary distance:", binary_distance(left, right))
+    print("Jaccard distance:", jaccard_distance(left, right))
+    print("MASI distance:", masi_distance(left, right))
+
+
+if __name__ == "__main__":
+    demo()
--- a/backend/venv/Lib/site-packages/nltk/metrics/paice.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/paice.py
@@ -0,0 +1,389 @@
+# Natural Language Toolkit: Paice Stemmer Evaluation Metrics
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Lauri Hallila <laurihallila@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+"""Counts Paice's performance statistics for evaluating stemming algorithms.
+
+What is required:
+ - A dictionary of words grouped by their real lemmas
+ - A dictionary of words grouped by stems from a stemming algorithm
+
+When these are given, Understemming Index (UI), Overstemming Index (OI),
+Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted.
+
+References:
+Chris D. Paice (1994). An evaluation method for stemming algorithms.
+In Proceedings of SIGIR, 42--50.
+"""
+
+from math import sqrt
+
+
+def get_words_from_dictionary(lemmas):
+    """
+    Collect every word that appears under any lemma.
+
+    :param lemmas: A dictionary where keys are lemmas and values are sets
+        or lists of words corresponding to that lemma.
+    :type lemmas: dict(str): list(str)
+    :return: Set of all words found among the dictionary's values
+    :rtype: set(str)
+    """
+    return {word for lemma in lemmas for word in lemmas[lemma]}
+
+
+def _truncate(words, cutlength):
+    """Group words under the stem obtained by keeping their first characters.
+
+    :param words: Set of words used for analysis
+    :param cutlength: Number of leading characters kept as the stem.
+    :type words: set(str) or list(str)
+    :type cutlength: int
+    :return: Dictionary where keys are stems and values are sets of words
+        corresponding to that stem.
+    :rtype: dict(str): set(str)
+    """
+    groups = {}
+    for word in words:
+        # setdefault creates the stem's set the first time the stem is seen.
+        groups.setdefault(word[:cutlength], set()).add(word)
+    return groups
+
+
+# Reference: https://en.wikipedia.org/wiki/Line-line_intersection
+def _count_intersection(l1, l2):
+    """Find where the lines through two segments cross.
+
+    :param l1: Two coordinate pairs defining the first line segment
+    :param l2: Two coordinate pairs defining the second line segment
+    :type l1: tuple(tuple(float, float), tuple(float, float))
+    :type l2: tuple(tuple(float, float), tuple(float, float))
+    :return: Coordinates of the intersection
+    :rtype: tuple(float, float)
+    :raise ZeroDivisionError: If the lines are parallel and the four
+        x-coordinates are not all zero.
+    """
+    (x1, y1), (x2, y2) = l1[0], l1[1]
+    (x3, y3), (x4, y4) = l2[0], l2[1]
+
+    dx12, dy12 = x1 - x2, y1 - y2
+    dx34, dy34 = x3 - x4, y3 - y4
+    denominator = dx12 * dy34 - dy12 * dx34
+
+    # A zero denominator means the lines are parallel.  The only parallel
+    # case handled is both segments lying on the y-axis.
+    # We can ignore x-axis because we stop counting the
+    # truncation line when we get there.
+    # There are no other options as UI (x-axis) grows and
+    # OI (y-axis) diminishes when we go along the truncation line.
+    if denominator == 0.0 and x1 == x2 == x3 == x4 == 0.0:
+        return (0.0, y4)
+
+    # Cross products of each segment's end points, shared by both coordinates.
+    cross12 = x1 * y2 - y1 * x2
+    cross34 = x3 * y4 - y3 * x4
+    x = (cross12 * dx34 - dx12 * cross34) / denominator
+    y = (cross12 * dy34 - dy12 * cross34) / denominator
+    return (x, y)
+
+
+def _get_derivative(coordinates):
+    """Return the slope of the line joining (0,0) to the given point.
+
+    :param coordinates: A coordinate pair
+    :type coordinates: tuple(float, float)
+    :return: y divided by x; inf if x is zero
+    :rtype: float
+    """
+    run, rise = coordinates[0], coordinates[1]
+    try:
+        slope = rise / run
+    except ZeroDivisionError:
+        # Division by a zero x is reported as an infinite slope.
+        slope = float("inf")
+    return slope
+
+
+def _calculate_cut(lemmawords, stems):
+    """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
+
+    :param lemmawords: Set or list of words corresponding to certain lemma.
+    :param stems: A dictionary where keys are stems and values are sets
+        or lists of words corresponding to that stem.
+    :type lemmawords: set(str) or list(str)
+    :type stems: dict(str): set(str)
+    :return: Amount of understemmed and overstemmed pairs contributed by words
+        existing in both lemmawords and stems.
+    :rtype: tuple(float, float)
+    """
+    # Both values are the same for every stem, so compute them once
+    # instead of once per loop iteration.
+    lemma_set = set(lemmawords)
+    lemmacount = len(lemmawords)
+
+    umt, wmt = 0.0, 0.0
+    for stem in stems:
+        stemwords = stems[stem]
+        # Number of distinct words shared by this lemma and this stem.
+        cutcount = len(lemma_set.intersection(stemwords))
+        if cutcount:
+            # Unachieved merge total
+            umt += cutcount * (lemmacount - cutcount)
+            # Wrongly merged total
+            wmt += cutcount * (len(stemwords) - cutcount)
+    return (umt, wmt)
+
+
+def _calculate(lemmas, stems):
+    """Sum the actual and maximum possible understemmed/overstemmed word pairs.
+
+    :param lemmas: A dictionary where keys are lemmas and values are sets
+        or lists of words corresponding to that lemma.
+    :param stems: A dictionary where keys are stems and values are sets
+        or lists of words corresponding to that stem.
+    :type lemmas: dict(str): list(str)
+    :type stems: dict(str): set(str)
+    :return: Global unachieved merge total (gumt),
+        global desired merge total (gdmt),
+        global wrongly merged total (gwmt) and
+        global desired non-merge total (gdnt).
+    :rtype: tuple(float, float, float, float)
+    """
+    groups = [lemmas[lemma] for lemma in lemmas]
+
+    # Total number of words over all lemma groups.
+    n = sum(len(group) for group in groups)
+
+    gumt = gdmt = gwmt = gdnt = 0.0
+    for lemmawords in groups:
+        size = len(lemmawords)
+
+        # Desired merge total: ordered pairs of words inside this group.
+        gdmt += size * (size - 1)
+
+        # Desired non-merge total: ordered pairs of a word in this group
+        # with a word outside it.
+        gdnt += size * (n - size)
+
+        # Understemmed and overstemmed pairs contributed by each
+        # (lemma, stem) pair that has words in common.
+        umt, wmt = _calculate_cut(lemmawords, stems)
+        gumt += umt
+        gwmt += wmt
+
+    # Each object is counted twice, so divide by two
+    return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2)
+
+
+def _indexes(gumt, gdmt, gwmt, gdnt):
+    """Derive the Understemming Index, Overstemming Index and Stemming Weight.
+
+    :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
+        global desired merge total (gdmt),
+        global wrongly merged total (gwmt) and
+        global desired non-merge total (gdnt).
+    :type gumt, gdmt, gwmt, gdnt: float
+    :return: Understemming Index (UI = GUMT / GDMT),
+        Overstemming Index (OI = GWMT / GDNT) and
+        Stemming Weight (SW = OI / UI).
+    :rtype: tuple(float, float, float)
+    """
+
+    def _ratio_or_zero(numerator, denominator):
+        # An index whose maximum total is 0 is defined as 0.
+        try:
+            return numerator / denominator
+        except ZeroDivisionError:
+            return 0.0
+
+    ui = _ratio_or_zero(gumt, gdmt)
+    oi = _ratio_or_zero(gwmt, gdnt)
+
+    try:
+        sw = oi / ui
+    except ZeroDivisionError:
+        # UI is 0: SW is 'not a number' when OI is 0 as well,
+        # otherwise it is infinity.
+        sw = float("nan") if oi == 0.0 else float("inf")
+    return (ui, oi, sw)
+
+
+class Paice:
+    """Class for storing lemmas, stems and evaluation metrics.
+
+    The statistics are computed on construction and can be recomputed with
+    ``update()`` after ``lemmas`` or ``stems`` has been reassigned.
+    """
+
+    def __init__(self, lemmas, stems):
+        """
+        :param lemmas: A dictionary where keys are lemmas and values are sets
+            or lists of words corresponding to that lemma.
+        :param stems: A dictionary where keys are stems and values are sets
+            or lists of words corresponding to that stem.
+        :type lemmas: dict(str): list(str)
+        :type stems: dict(str): set(str)
+        """
+        self.lemmas = lemmas
+        self.stems = stems
+        # Points of the truncation line, filled in by update().
+        self.coords = []
+        self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None)
+        self.ui, self.oi, self.sw = (None, None, None)
+        self.errt = None
+        self.update()
+
+    def __str__(self):
+        """Return a multi-line report of every stored statistic."""
+        coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
+        lines = [
+            "Global Unachieved Merge Total (GUMT): %s" % self.gumt,
+            "Global Desired Merge Total (GDMT): %s" % self.gdmt,
+            "Global Wrongly-Merged Total (GWMT): %s" % self.gwmt,
+            "Global Desired Non-merge Total (GDNT): %s" % self.gdnt,
+            "Understemming Index (GUMT / GDMT): %s" % self.ui,
+            "Overstemming Index (GWMT / GDNT): %s" % self.oi,
+            "Stemming Weight (OI / UI): %s" % self.sw,
+            "Error-Rate Relative to Truncation (ERRT): %s" % self.errt,
+            "Truncation line: %s" % coordinates,
+        ]
+        # Every line is separated by a plain "\n"; the ERRT line used to be
+        # terminated with "\r\n", which put a stray carriage return in the
+        # output.
+        return "\n".join(lines)
+
+    def _get_truncation_indexes(self, words, cutlength):
+        """Count (UI, OI) when stemming is done by truncating words at 'cutlength'.
+
+        :param words: Words used for the analysis
+        :param cutlength: Words are stemmed by cutting them at this length
+        :type words: set(str) or list(str)
+        :type cutlength: int
+        :return: Understemming and overstemming indexes
+        :rtype: tuple(float, float)
+        """
+
+        truncated = _truncate(words, cutlength)
+        gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
+        # The third value returned by _indexes (stemming weight) is not needed.
+        ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2]
+        return (ui, oi)
+
+    def _get_truncation_coordinates(self, cutlength=0):
+        """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
+
+        :param cutlength: Optional parameter to start counting from (ui, oi)
+            coordinates gotten by stemming at this length. Useful for speeding up
+            the calculations when you know the approximate location of the
+            intersection.
+        :type cutlength: int
+        :return: List of coordinate pairs that define the truncation line
+        :rtype: list(tuple(float, float))
+        """
+        words = get_words_from_dictionary(self.lemmas)
+        # default=0 keeps max() from raising ValueError when the lemma
+        # dictionary contains no words at all.
+        maxlength = max((len(word) for word in words), default=0)
+
+        # Truncate words from different points until (0, 0) - (ui, oi) segment crosses the truncation line
+        coords = []
+        while cutlength <= maxlength:
+            # Get (UI, OI) pair of current truncation point
+            pair = self._get_truncation_indexes(words, cutlength)
+
+            # Store only new coordinates so we'll have an actual
+            # line segment when counting the intersection point
+            if pair not in coords:
+                coords.append(pair)
+            if pair == (0.0, 0.0):
+                # Stop counting if truncation line goes through origo;
+                # length from origo to truncation line is 0
+                return coords
+            if len(coords) >= 2 and pair[0] > 0.0:
+                derivative1 = _get_derivative(coords[-2])
+                derivative2 = _get_derivative(coords[-1])
+                # Derivative of the truncation line is a decreasing value;
+                # when it passes Stemming Weight, we've found the segment
+                # of truncation line intersecting with (0, 0) - (ui, oi) segment
+                if derivative1 >= self.sw >= derivative2:
+                    return coords
+            cutlength += 1
+        return coords
+
+    def _errt(self):
+        """Count Error-Rate Relative to Truncation (ERRT).
+
+        Also stores the truncation line in ``self.coords``.
+
+        :return: ERRT, length of the line from origo to (UI, OI) divided by
+            the length of the line from origo to the point defined by the same
+            line when extended until the truncation line. ``inf`` or ``nan``
+            when the truncation line goes through origo, and 0.0 when
+            (UI, OI) is origo but the truncation line does not pass through it.
+        :rtype: float
+        """
+        # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
+        self.coords = self._get_truncation_coordinates()
+        if (0.0, 0.0) in self.coords:
+            # Truncation line goes through origo, so ERRT cannot be counted
+            if (self.ui, self.oi) != (0.0, 0.0):
+                return float("inf")
+            else:
+                return float("nan")
+        if (self.ui, self.oi) == (0.0, 0.0):
+            # (ui, oi) is origo; define errt as 0.0
+            return 0.0
+        # Count the intersection point
+        # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has different coordinates
+        # so we have actual line segments instead of a line segment and a point
+        intersection = _count_intersection(
+            ((0, 0), (self.ui, self.oi)), self.coords[-2:]
+        )
+        # Count OP (length of the line from origo to (ui, oi))
+        op = sqrt(self.ui**2 + self.oi**2)
+        # Count OT (length of the line from origo to truncation line that goes through (ui, oi))
+        ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2)
+        # OP / OT tells how well the stemming algorithm works compared to just truncating words
+        return op / ot
+
+    def update(self):
+        """Update statistics after lemmas and stems have been set."""
+        self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
+        self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
+        # ERRT is computed last because it reads self.ui, self.oi and self.sw.
+        self.errt = self._errt()
+
+
+def demo():
+    """Demonstration of the module."""
+
+    def show(groups):
+        # One "key => words" line per group in key order, then a blank line.
+        for key in sorted(groups):
+            print(f"{key} => {' '.join(groups[key])}")
+        print()
+
+    # Some words with their real lemmas
+    lemmas = {
+        "kneel": ["kneel", "knelt"],
+        "range": ["range", "ranged"],
+        "ring": ["ring", "rang", "rung"],
+    }
+    # Same words with stems from a stemming algorithm
+    stems = {
+        "kneel": ["kneel"],
+        "knelt": ["knelt"],
+        "rang": ["rang", "range", "ranged"],
+        "ring": ["ring"],
+        "rung": ["rung"],
+    }
+    print("Words grouped by their lemmas:")
+    show(lemmas)
+    print("Same words grouped by a stemming algorithm:")
+    show(stems)
+    p = Paice(lemmas, stems)
+    print(p)
+    print()
+
+    # Let's "change" results from a stemming algorithm
+    restemmed = {
+        "kneel": ["kneel"],
+        "knelt": ["knelt"],
+        "rang": ["rang"],
+        "range": ["range", "ranged"],
+        "ring": ["ring"],
+        "rung": ["rung"],
+    }
+    print("Counting stats after changing stemming results:")
+    show(restemmed)
+    # Swap in the new grouping and recompute the statistics in place.
+    p.stems = restemmed
+    p.update()
+    print(p)
+
+
+# Run the demonstration only when this module is executed as a script.
+if __name__ == "__main__":
+    demo()
--- a/backend/venv/Lib/site-packages/nltk/metrics/scores.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/scores.py
@@ -0,0 +1,228 @@
+# Natural Language Toolkit: Evaluation
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import operator
+from functools import reduce
+from math import fabs
+from random import shuffle
+
+try:
+    from scipy.stats.stats import betai
+except ImportError:
+    betai = None
+
+from nltk.util import LazyConcatenation, LazyMap
+
+
+def accuracy(reference, test):
+    """
+    Given a list of reference values and a corresponding list of test
+    values, return the fraction of corresponding values that are
+    equal.  In particular, return the fraction of indices
+    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.
+
+    :type reference: list
+    :param reference: An ordered list of reference values.
+    :type test: list
+    :param test: A list of values to compare against the corresponding
+        reference values.
+    :raise ValueError: If ``reference`` and ``test`` do not have the
+        same length.
+    """
+    if len(reference) != len(test):
+        raise ValueError("Lists must have the same length.")
+    # One comparison result per position; summing them counts the matches.
+    hits = [ref_val == test_val for ref_val, test_val in zip(reference, test)]
+    return sum(hits) / len(test)
+
+
+def precision(reference, test):
+    """
+    Return the fraction of test values that also occur in the reference
+    set, i.e. card(``reference`` intersection ``test``)/card(``test``).
+    If ``test`` is empty, then return None.
+
+    :type reference: set
+    :param reference: A set of reference values.
+    :type test: set
+    :param test: A set of values to compare against the reference set.
+    :rtype: float or None
+    :raise TypeError: If either argument has no ``intersection`` attribute.
+    """
+    if not (hasattr(reference, "intersection") and hasattr(test, "intersection")):
+        raise TypeError("reference and test should be sets")
+
+    test_size = len(test)
+    if test_size == 0:
+        return None
+    return len(reference.intersection(test)) / test_size
+
+
+def recall(reference, test):
+    """
+    Return the fraction of reference values that also occur in the test
+    set, i.e. card(``reference`` intersection ``test``)/card(``reference``).
+    If ``reference`` is empty, then return None.
+
+    :type reference: set
+    :param reference: A set of reference values.
+    :type test: set
+    :param test: A set of values to compare against the reference set.
+    :rtype: float or None
+    :raise TypeError: If either argument has no ``intersection`` attribute.
+    """
+    if not (hasattr(reference, "intersection") and hasattr(test, "intersection")):
+        raise TypeError("reference and test should be sets")
+
+    reference_size = len(reference)
+    if reference_size == 0:
+        return None
+    return len(reference.intersection(test)) / reference_size
+
+
+def f_measure(reference, test, alpha=0.5):
+    """
+    Given a set of reference values and a set of test values, return
+    the f-measure of the test values, when compared against the
+    reference values.  The f-measure is the harmonic mean of the
+    ``precision`` and ``recall``, weighted by ``alpha``.  In particular,
+    given the precision *p* and recall *r* defined by:
+
+    - *p* = card(``reference`` intersection ``test``)/card(``test``)
+    - *r* = card(``reference`` intersection ``test``)/card(``reference``)
+
+    The f-measure is:
+
+    - *1/(alpha/p + (1-alpha)/r)*
+
+    If either ``reference`` or ``test`` is empty, then ``f_measure``
+    returns None.  If either *p* or *r* is zero, it returns 0.
+
+    :type reference: set
+    :param reference: A set of reference values.
+    :type test: set
+    :param test: A set of values to compare against the reference set.
+    :type alpha: float
+    :param alpha: Weight given to the precision term.
+    :rtype: float or None
+    """
+    p, r = precision(reference, test), recall(reference, test)
+    if p is None or r is None:
+        return None
+    if p == 0 or r == 0:
+        # Avoid dividing by a zero precision or recall below.
+        return 0
+    weighted_inverse = alpha / p + (1 - alpha) / r
+    return 1.0 / weighted_inverse
+
+
+def log_likelihood(reference, test):
+    """
+    Return the average log likelihood of the reference values, where the
+    i-th value is scored with ``logprob`` of the i-th distribution in
+    ``test``.
+
+    :param reference: A list of reference values
+    :type reference: list
+    :param test: A list of probability distributions over values to
+        compare against the corresponding reference values.
+    :type test: list(ProbDistI)
+    :raise ValueError: If ``reference`` and ``test`` do not have the
+        same length.
+    """
+    if len(reference) != len(test):
+        raise ValueError("Lists must have the same length.")
+
+    # Log probability of each reference value under its own distribution.
+    logprobs = (dist.logprob(val) for val, dist in zip(reference, test))
+    return sum(logprobs) / len(reference)
+
+
+def approxrand(a, b, **kwargs):
+    """
+    Returns an approximate significance level between two lists of
+    independently generated test values.
+
+    Approximate randomization calculates significance by randomly drawing
+    from a sample of the possible permutations. At the limit of the number
+    of possible permutations, the significance level is exact. The
+    approximate significance level is the sample mean number of times the
+    statistic of the permuted lists varies from the actual statistic of
+    the unpermuted argument lists.
+
+    Recognised keyword arguments:
+
+    - ``shuffles``: number of random shuffles to draw (default 999); capped
+      at the factorial of ``len(a) + len(b)``.
+    - ``statistic``: function mapping a sequence of values to a number
+      (default: the arithmetic mean).
+    - ``verbose``: if true, print progress information (default False).
+
+    :return: a tuple containing an approximate significance level, the count
+             of the number of times the pseudo-statistic varied from the
+             actual statistic (a float, because the counter starts at
+             1e-100), and the number of shuffles
+    :rtype: tuple
+    :param a: a list of test values
+    :type a: list
+    :param b: another list of independently generated test values
+    :type b: list
+    """
+    shuffles = kwargs.get("shuffles", 999)
+    # there's no point in trying to shuffle beyond all possible permutations
+    # (the reduce() call computes the factorial of len(a) + len(b))
+    shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
+    stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
+    verbose = kwargs.get("verbose", False)
+
+    if verbose:
+        print("shuffles: %d" % shuffles)
+
+    # Observed absolute difference between the statistics of the two lists.
+    actual_stat = fabs(stat(a) - stat(b))
+
+    if verbose:
+        print("actual statistic: %f" % actual_stat)
+        print("-" * 60)
+
+    # Counter of shuffles whose pseudo-statistic is at least the actual one.
+    # NOTE(review): it starts at a tiny positive float instead of 0,
+    # presumably so that betai() below never receives an exact zero -- confirm.
+    c = 1e-100
+    lst = LazyConcatenation([a, b])
+    indices = list(range(len(a) + len(b)))
+
+    for i in range(shuffles):
+        if verbose and i % 10 == 0:
+            print("shuffle: %d" % i)
+
+        # Shuffle positions into the pooled values, not the values themselves.
+        shuffle(indices)
+
+        # The first len(a) shuffled positions form pseudo-sample A, the rest
+        # pseudo-sample B.  The lambda's ``i`` is its own parameter, not the
+        # loop variable.
+        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[: len(a)]))
+        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a) :]))
+        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)
+
+        if pseudo_stat >= actual_stat:
+            c += 1
+
+        if verbose and i % 10 == 0:
+            print("pseudo-statistic: %f" % pseudo_stat)
+            print("significance: %f" % ((c + 1) / (i + 1)))
+            print("-" * 60)
+
+    # Add-one estimate over all shuffles performed.
+    significance = (c + 1) / (shuffles + 1)
+
+    if verbose:
+        print("significance: %f" % significance)
+        # betai is None when the scipy import at the top of the file failed.
+        if betai:
+            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
+                print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}")
+
+    return (significance, c, shuffles)
+
+
+def demo():
+    """Print accuracy, precision, recall and F-measure for a small tagging example."""
+    rule = "-" * 75
+    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
+    test = "DET VB VB DET NN NN NN IN DET NN".split()
+
+    # List-based measure: position-by-position comparison.
+    print(rule)
+    print("Reference =", reference)
+    print("Test    =", test)
+    print("Accuracy:", accuracy(reference, test))
+
+    # Set-based measures: only the distinct tags matter.
+    reference_set, test_set = set(reference), set(test)
+    print(rule)
+    print("Reference =", reference_set)
+    print("Test =   ", test_set)
+    print("Precision:", precision(reference_set, test_set))
+    print("   Recall:", recall(reference_set, test_set))
+    print("F-Measure:", f_measure(reference_set, test_set))
+    print(rule)
+
+
+# Run the demonstration only when this module is executed as a script.
+if __name__ == "__main__":
+    demo()
--- a/backend/venv/Lib/site-packages/nltk/metrics/segmentation.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/segmentation.py
@@ -0,0 +1,222 @@
+# Natural Language Toolkit: Text Segmentation Metrics
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+#         David Doukhan <david.doukhan@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+Text Segmentation Metrics
+
+1. Windowdiff
+
+Pevzner, L., and Hearst, M., A Critique and Improvement of
+  an Evaluation Metric for Text Segmentation,
+  Computational Linguistics 28, 19-36
+
+
+2. Generalized Hamming Distance
+
+Bookstein A., Kulyukin V.A., Raita T.
+Generalized Hamming Distance
+Information Retrieval 5, 2002, pp 353-375
+
+Baseline implementation in C++
+http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
+
+Study describing benefits of Generalized Hamming Distance Versus
+WindowDiff for evaluating text segmentation tasks
+Bestgen, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
+TALN 2009
+
+
+3. Pk text segmentation metric
+
+Beeferman D., Berger A., Lafferty J. (1999)
+Statistical Models for Text Segmentation
+Machine Learning, 34, 177-210
+"""
+
+try:
+    import numpy as np
+except ImportError:
+    pass
+
+
+def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
+    """
+    Compute the windowdiff score for a pair of segmentations.  A
+    segmentation is any sequence over a vocabulary of two items
+    (e.g. "0", "1"), where the specified boundary value is used to
+    mark the edge of a segmentation.
+
+        >>> s1 = "000100000010"
+        >>> s2 = "000010000100"
+        >>> s3 = "100000010000"
+        >>> '%.2f' % windowdiff(s1, s1, 3)
+        '0.00'
+        >>> '%.2f' % windowdiff(s1, s2, 3)
+        '0.30'
+        >>> '%.2f' % windowdiff(s2, s3, 3)
+        '0.80'
+
+    :param seg1: a segmentation
+    :type seg1: str or list
+    :param seg2: a segmentation
+    :type seg2: str or list
+    :param k: window width
+    :type k: int
+    :param boundary: boundary value
+    :type boundary: str or int or bool
+    :param weighted: use the weighted variant of windowdiff, which adds the
+        full difference in boundary counts per window instead of at most 1
+    :type weighted: boolean
+    :rtype: float
+    :raise ValueError: If the segmentations differ in length or ``k``
+        exceeds that length.
+    """
+
+    length = len(seg1)
+    if length != len(seg2):
+        raise ValueError("Segmentations have unequal length")
+    if k > length:
+        raise ValueError(
+            "Window width k should be smaller or equal than segmentation lengths"
+        )
+
+    # Number of positions at which a window of width k fits.
+    nwindows = length - k + 1
+    wd = 0
+    for start in range(nwindows):
+        stop = start + k
+        ndiff = abs(
+            seg1[start:stop].count(boundary) - seg2[start:stop].count(boundary)
+        )
+        wd += ndiff if weighted else min(1, ndiff)
+    return wd / float(nwindows)
+
+
+# Generalized Hamming Distance
+
+
+def _init_mat(nrows, ncols, ins_cost, del_cost):
+    """Allocate the GHD cost matrix and fill in its first row and column.
+
+    Row 0 holds multiples of ``ins_cost`` and column 0 multiples of
+    ``del_cost``; all other cells are left uninitialised.
+    """
+    costs = np.empty((nrows, ncols))
+    costs[0, :] = np.arange(ncols) * ins_cost
+    costs[:, 0] = np.arange(nrows) * del_cost
+    return costs
+
+
+def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
+    """Fill ``mat`` in place with the cheapest cost of matching boundaries.
+
+    Cell ``[i + 1, j + 1]`` is computed from boundary position ``rowv[i]``
+    and boundary position ``colv[j]``; row 0 and column 0 must already be
+    initialised.
+    """
+    for i, row_pos in enumerate(rowv):
+        for j, col_pos in enumerate(colv):
+            diagonal = mat[i, j]
+            if row_pos == col_pos:
+                # boundaries are at the same location, no transformation required
+                tcost = diagonal
+            elif row_pos > col_pos:
+                # boundary match through a deletion
+                tcost = del_cost + mat[i, j + 1]
+            else:
+                # boundary match through an insertion
+                tcost = ins_cost + mat[i + 1, j]
+            # Alternative: shift one boundary onto the other.
+            shift_cost = shift_cost_coeff * abs(row_pos - col_pos) + diagonal
+            mat[i + 1, j + 1] = min(tcost, shift_cost)
+
+
+def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
+    """
+    Compute the Generalized Hamming Distance for a reference and a hypothetical
+    segmentation, corresponding to the cost related to the transformation
+    of the hypothetical segmentation into the reference segmentation
+    through boundary insertion, deletion and shift operations.
+
+    A segmentation is any sequence over a vocabulary of two items
+    (e.g. "0", "1"), where the specified boundary value is used to
+    mark the edge of a segmentation.
+
+    Recommended parameter values are a shift_cost_coeff of 2,
+    together with an ins_cost and a del_cost equal to the mean segment
+    length in the reference segmentation.
+
+        >>> # Same examples as Kulyukin C++ implementation
+        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
+        0.5
+        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
+        2.0
+        >>> ghd('011', '110', 1.0, 1.0, 0.5)
+        1.0
+        >>> ghd('1', '0', 1.0, 1.0, 0.5)
+        1.0
+        >>> ghd('111', '000', 1.0, 1.0, 0.5)
+        3.0
+        >>> ghd('000', '111', 1.0, 2.0, 0.5)
+        6.0
+
+    :param ref: the reference segmentation
+    :type ref: str or list
+    :param hyp: the hypothetical segmentation
+    :type hyp: str or list
+    :param ins_cost: insertion cost
+    :type ins_cost: float
+    :param del_cost: deletion cost
+    :type del_cost: float
+    :param shift_cost_coeff: constant used to compute the cost of a shift.
+        ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
+        are the positions indicating the shift
+    :type shift_cost_coeff: float
+    :param boundary: boundary value
+    :type boundary: str or int or bool
+    :rtype: float
+    """
+
+    # Positions of the boundaries in each segmentation.
+    ref_idx = [pos for pos, val in enumerate(ref) if val == boundary]
+    hyp_idx = [pos for pos, val in enumerate(hyp) if val == boundary]
+
+    nref_bound, nhyp_bound = len(ref_idx), len(hyp_idx)
+
+    # When one side has no boundary the cost is pure insertions or pure
+    # deletions, so no matrix is needed.
+    if nhyp_bound == 0:
+        return nref_bound * ins_cost if nref_bound else 0.0
+    if nref_bound == 0:
+        return nhyp_bound * del_cost
+
+    mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
+    _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
+    # The bottom-right cell holds the cost of matching all boundaries.
+    return float(mat[-1, -1])
+
+
+# Beeferman's Pk text segmentation evaluation metric
+
+
+def pk(ref, hyp, k=None, boundary="1"):
+    """
+    Compute the Pk metric for a pair of segmentations. A segmentation
+    is any sequence over a vocabulary of two items (e.g. "0", "1"),
+    where the specified boundary value is used to mark the edge of a
+    segmentation.
+
+    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
+    '0.50'
+    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
+    '0.50'
+    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
+    '0.00'
+
+    :param ref: the reference segmentation
+    :type ref: str or list
+    :param hyp: the segmentation to evaluate
+    :type hyp: str or list
+    :param k: window size, if None, set to half of the average reference segment length
+    :type k: int or None
+    :param boundary: boundary value
+    :type boundary: str or int or bool
+    :rtype: float
+    :raise ZeroDivisionError: If ``k`` is None and ``ref`` contains no
+        boundary value.
+    """
+
+    if k is None:
+        k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
+
+    # A window is an error when exactly one of the two segmentations has
+    # a boundary inside it.
+    err = sum(
+        1
+        for start in range(len(ref) - k + 1)
+        if (ref[start : start + k].count(boundary) > 0)
+        != (hyp[start : start + k].count(boundary) > 0)
+    )
+    return err / (len(ref) - k + 1.0)
--- a/backend/venv/Lib/site-packages/nltk/metrics/spearman.py
+++ b/backend/venv/Lib/site-packages/nltk/metrics/spearman.py
@@ -0,0 +1,68 @@
+# Natural Language Toolkit: Spearman Rank Correlation
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Joel Nothman <jnothman@student.usyd.edu.au>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tools for comparing ranked lists.
+"""
+
+
+def _rank_dists(ranks1, ranks2):
+    """Yield ``(key, difference)`` pairs for every key ranked in both inputs.
+
+    Each argument may be a dict or a sequence of (key, rank) pairs; both are
+    copied into fresh dicts first. Keys are visited in the order of
+    ``ranks1`` and the difference is the ``ranks1`` value minus the ``ranks2``
+    value; keys missing from ``ranks2`` are skipped.
+    """
+    first, second = dict(ranks1), dict(ranks2)
+    for key, rank in first.items():
+        if key in second:
+            yield key, rank - second[key]
+
+
+def spearman_correlation(ranks1, ranks2):
+    """Compute the Spearman correlation coefficient of two rankings.
+
+    Each ranking is a dict or a sequence of (key, rank) pairs. Only keys
+    found in both rankings contribute, so for meaningful results drop keys
+    present in just one of them before ranking. The value runs from -1.0
+    (opposite ranks) to 1.0 (identical ranks); 0.0 is returned when fewer
+    than two keys are shared, since the coefficient is undefined there.
+    """
+    squares = [diff * diff for _, diff in _rank_dists(ranks1, ranks2)]
+    count = len(squares)
+    denominator = count * (count * count - 1)
+    if denominator == 0:
+        # Zero or one shared key: nothing to correlate.
+        return 0.0
+    return 1 - (6 * sum(squares) / denominator)
+
+
+def ranks_from_sequence(seq):
+    """Return a generator pairing each element of ``seq`` with its zero-based
+    position as rank, ready to be passed to ``spearman_correlation``."""
+    numbered = enumerate(seq)
+    return ((item, position) for position, item in numbered)
+
+
+def ranks_from_scores(scores, rank_gap=1e-15):
+    """Yield (key, rank) for each (key, score) tuple: a key takes its position
+    as rank when its score differs from the previous score by more than
+    ``rank_gap`` and otherwise ties with the previous key's rank. Suitable
+    as an argument to ``spearman_correlation``."""
+    current_rank = 0
+    last_score = None
+    for position, (key, score) in enumerate(scores):
+        try:
+            moved = abs(score - last_score) > rank_gap
+        except TypeError:
+            # No usable previous score (e.g. the first item): keep the rank.
+            moved = False
+        if moved:
+            current_rank = position
+        yield key, current_rank
+        last_score = score