Initial commit
101
backend/venv/Lib/site-packages/nltk/classify/__init__.py
Normal file
@@ -0,0 +1,101 @@
# Natural Language Toolkit: Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Classes and interfaces for labeling tokens with category labels (or
"class labels"). Typically, labels are represented with strings
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
perform a wide range of classification tasks. For example,
classifiers can be used...

- to classify documents by topic
- to classify ambiguous words by which word sense is intended
- to classify acoustic signals by which phoneme they represent
- to classify sentences by their author

Features
========
In order to decide which category label is appropriate for a given
token, classifiers examine one or more 'features' of the token. These
"features" are typically chosen by hand, and indicate which aspects
of the token are relevant to the classification decision. For
example, a document classifier might use a separate feature for each
word, recording how often that word occurred in the document.

Featuresets
===========
The features describing a token are encoded using a "featureset",
which is a dictionary that maps from "feature names" to "feature
values". Feature names are unique strings that indicate what aspect
of the token is encoded by the feature. Examples include
``'prevword'``, for a feature whose value is the previous word; and
``'contains-word(library)'`` for a feature that is true when a document
contains the word ``'library'``. Feature values are typically
booleans, numbers, or strings, depending on which feature they
describe.

Featuresets are typically constructed using a "feature detector"
(also known as a "feature extractor"). A feature detector is a
function that takes a token (and sometimes information about its
context) as its input, and returns a featureset describing that token.
For example, the following feature detector converts a document
(stored as a list of words) to a featureset describing the set of
words included in the document:

    >>> # Define a feature detector function.
    >>> def document_features(document):
    ...     return dict([('contains-word(%s)' % w, True) for w in document])

Feature detectors are typically applied to each token before it is fed
to the classifier:

    >>> # Classify each Gutenberg document.
    >>> from nltk.corpus import gutenberg
    >>> for fileid in gutenberg.fileids(): # doctest: +SKIP
    ...     doc = gutenberg.words(fileid) # doctest: +SKIP
    ...     print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP

The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
feature detector for word sense disambiguation (WSD) might take as its
input a sentence, and the index of a word that should be classified,
and return a featureset for that word. The following feature detector
for WSD includes features describing the left and right contexts of
the target word:

    >>> def wsd_features(sentence, index):
    ...     featureset = {}
    ...     for i in range(max(0, index-3), index):
    ...         featureset['left-context(%s)' % sentence[i]] = True
    ...     for i in range(index, max(index+3, len(sentence))):
    ...         featureset['right-context(%s)' % sentence[i]] = True
    ...     return featureset

Training Classifiers
====================
Most classifiers are built by training them on a list of hand-labeled
examples, known as the "training set". Training sets are represented
as lists of ``(featuredict, label)`` tuples.
"""

from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.maxent import (
    BinaryMaxentFeatureEncoding,
    ConditionalExponentialClassifier,
    MaxentClassifier,
    TypedMaxentFeatureEncoding,
)
from nltk.classify.megam import call_megam, config_megam
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.weka import WekaClassifier, config_weka
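The "Training Classifiers" section above can be exercised end to end. The sketch below builds a toy training set of ``(featuredict, label)`` tuples with the ``document_features`` detector from the docstring and trains a ``NaiveBayesClassifier`` on it; the short documents and the ``'sports'``/``'health'`` labels are invented purely for illustration:

    >>> from nltk.classify import NaiveBayesClassifier
    >>> def document_features(document):
    ...     return {'contains-word(%s)' % w: True for w in document}
    >>> train_set = [
    ...     (document_features('the team won the game'.split()), 'sports'),
    ...     (document_features('the goalkeeper saved the shot'.split()), 'sports'),
    ...     (document_features('the new diet lowers blood pressure'.split()), 'health'),
    ...     (document_features('doctors recommend regular exercise'.split()), 'health'),
    ... ]
    >>> classifier = NaiveBayesClassifier.train(train_set)
    >>> classifier.classify(document_features('the team lost the game'.split()))
    'sports'

Feature names never seen during training (here ``contains-word(lost)``) are simply ignored by the Naive Bayes classifier at classification time rather than zeroing out every label.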
195
backend/venv/Lib/site-packages/nltk/classify/api.py
Normal file
@@ -0,0 +1,195 @@
# Natural Language Toolkit: Classifier Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Interfaces for labeling tokens with category labels (or "class labels").

``ClassifierI`` is a standard interface for "single-category
classification", in which the set of categories is known, the number
of categories is finite, and each text belongs to exactly one
category.

``MultiClassifierI`` is a standard interface for "multi-category
classification", which is like single-category classification except
that each text belongs to zero or more categories.
"""
from nltk.internals import overridden

##//////////////////////////////////////////////////////
# { Classification Interfaces
##//////////////////////////////////////////////////////


class ClassifierI:
    """
    A processing interface for labeling tokens with a single category
    label (or "class"). Labels are typically strs or
    ints, but can be any immutable type. The set of labels
    that the classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]


class MultiClassifierI:
    """
    A processing interface for labeling tokens with zero or more
    category labels (or "labels"). Labels are typically strs
    or ints, but can be any immutable type. The set of labels
    that the multi-classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``. I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(set(label))
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]


# # [XX] IN PROGRESS:
# class SequenceClassifierI:
#     """
#     A processing interface for labeling sequences of tokens with a
#     single category label (or "class"). Labels are typically
#     strs or ints, but can be any immutable type. The set
#     of labels that the classifier chooses from must be fixed and
#     finite.
#     """
#     def labels(self):
#         """
#         :return: the list of category labels used by this classifier.
#         :rtype: list of (immutable)
#         """
#         raise NotImplementedError()

#     def prob_classify(self, featureset):
#         """
#         Return a probability distribution over labels for the given
#         featureset.

#         If ``featureset`` is a list of featuresets, then return a
#         corresponding list containing the probability distribution
#         over labels for each of the given featuresets, where the
#         *i*\ th element of this list is the most appropriate label for
#         the *i*\ th element of ``featuresets``.
#         """
#         raise NotImplementedError()

#     def classify(self, featureset):
#         """
#         Return the most appropriate label for the given featureset.

#         If ``featureset`` is a list of featuresets, then return a
#         corresponding list containing the most appropriate label for
#         each of the given featuresets, where the *i*\ th element of
#         this list is the most appropriate label for the *i*\ th element
#         of ``featuresets``.
#         """
#         raise NotImplementedError()
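As a sketch of how this interface is consumed, the minimal subclass below implements only ``labels()`` and ``classify()``; the inherited ``classify_many()`` then works automatically, and in the opposite case ``classify()`` would fall back to ``classify_many()`` via ``overridden``. The keyword rule is hypothetical and exists only to illustrate the contract:

    >>> from nltk.classify.api import ClassifierI
    >>> class KeywordClassifier(ClassifierI):
    ...     # Label a featureset 'sports' if it mentions a ball, else 'other'.
    ...     def labels(self):
    ...         return ['sports', 'other']
    ...     def classify(self, featureset):
    ...         return 'sports' if featureset.get('contains-word(ball)') else 'other'
    >>> clf = KeywordClassifier()
    >>> clf.classify_many([{'contains-word(ball)': True}, {'contains-word(vote)': True}])
    ['sports', 'other']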
349
backend/venv/Lib/site-packages/nltk/classify/decisiontree.py
Normal file
@@ -0,0 +1,349 @@
|
||||
# Natural Language Toolkit: Decision Tree Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier model that decides which label to assign to a token on
|
||||
the basis of a tree structure, where branches correspond to conditions
|
||||
on feature values, and leaves correspond to label assignments.
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import FreqDist, MLEProbDist, entropy
|
||||
|
||||
|
||||
class DecisionTreeClassifier(ClassifierI):
|
||||
def __init__(self, label, feature_name=None, decisions=None, default=None):
|
||||
"""
|
||||
:param label: The most likely label for tokens that reach
|
||||
this node in the decision tree. If this decision tree
|
||||
has no children, then this label will be assigned to
|
||||
any token that reaches this decision tree.
|
||||
:param feature_name: The name of the feature that this
|
||||
decision tree selects for.
|
||||
:param decisions: A dictionary mapping from feature values
|
||||
for the feature identified by ``feature_name`` to
|
||||
child decision trees.
|
||||
:param default: The child that will be used if the value of
|
||||
feature ``feature_name`` does not match any of the keys in
|
||||
``decisions``. This is used when constructing binary
|
||||
decision trees.
|
||||
"""
|
||||
self._label = label
|
||||
self._fname = feature_name
|
||||
self._decisions = decisions
|
||||
self._default = default
|
||||
|
||||
def labels(self):
|
||||
labels = [self._label]
|
||||
if self._decisions is not None:
|
||||
for dt in self._decisions.values():
|
||||
labels.extend(dt.labels())
|
||||
if self._default is not None:
|
||||
labels.extend(self._default.labels())
|
||||
return list(set(labels))
|
||||
|
||||
def classify(self, featureset):
|
||||
# Decision leaf:
|
||||
if self._fname is None:
|
||||
return self._label
|
||||
|
||||
# Decision tree:
|
||||
fval = featureset.get(self._fname)
|
||||
if fval in self._decisions:
|
||||
return self._decisions[fval].classify(featureset)
|
||||
elif self._default is not None:
|
||||
return self._default.classify(featureset)
|
||||
else:
|
||||
return self._label
|
||||
|
||||
def error(self, labeled_featuresets):
|
||||
errors = 0
|
||||
for featureset, label in labeled_featuresets:
|
||||
if self.classify(featureset) != label:
|
||||
errors += 1
|
||||
return errors / len(labeled_featuresets)
|
||||
|
||||
def pretty_format(self, width=70, prefix="", depth=4):
|
||||
"""
|
||||
Return a string containing a pretty-printed version of this
|
||||
decision tree. Each line in this string corresponds to a
|
||||
single decision tree node or leaf, and indentation is used to
|
||||
display the structure of the decision tree.
|
||||
"""
|
||||
# [xx] display default!!
|
||||
if self._fname is None:
|
||||
n = width - len(prefix) - 15
|
||||
return "{}{} {}\n".format(prefix, "." * n, self._label)
|
||||
s = ""
|
||||
for i, (fval, result) in enumerate(
|
||||
sorted(
|
||||
self._decisions.items(),
|
||||
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
|
||||
)
|
||||
):
|
||||
hdr = f"{prefix}{self._fname}={fval}? "
|
||||
n = width - 15 - len(hdr)
|
||||
s += "{}{} {}\n".format(hdr, "." * (n), result._label)
|
||||
if result._fname is not None and depth > 1:
|
||||
s += result.pretty_format(width, prefix + " ", depth - 1)
|
||||
if self._default is not None:
|
||||
n = width - len(prefix) - 21
|
||||
s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
|
||||
if self._default._fname is not None and depth > 1:
|
||||
s += self._default.pretty_format(width, prefix + " ", depth - 1)
|
||||
return s
|
||||
|
||||
def pseudocode(self, prefix="", depth=4):
|
||||
"""
|
||||
Return a string representation of this decision tree that
|
||||
expresses the decisions it makes as a nested set of pseudocode
|
||||
if statements.
|
||||
"""
|
||||
if self._fname is None:
|
||||
return f"{prefix}return {self._label!r}\n"
|
||||
s = ""
|
||||
for fval, result in sorted(
|
||||
self._decisions.items(),
|
||||
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
|
||||
):
|
||||
s += f"{prefix}if {self._fname} == {fval!r}: "
|
||||
if result._fname is not None and depth > 1:
|
||||
s += "\n" + result.pseudocode(prefix + " ", depth - 1)
|
||||
else:
|
||||
s += f"return {result._label!r}\n"
|
||||
if self._default is not None:
|
||||
if len(self._decisions) == 1:
|
||||
s += "{}if {} != {!r}: ".format(
|
||||
prefix, self._fname, list(self._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
s += f"{prefix}else: "
|
||||
if self._default._fname is not None and depth > 1:
|
||||
s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
|
||||
else:
|
||||
s += f"return {self._default._label!r}\n"
|
||||
return s
|
||||
|
||||
def __str__(self):
|
||||
return self.pretty_format()
|
||||
|
||||
@staticmethod
|
||||
def train(
|
||||
labeled_featuresets,
|
||||
entropy_cutoff=0.05,
|
||||
depth_cutoff=100,
|
||||
support_cutoff=10,
|
||||
binary=False,
|
||||
feature_values=None,
|
||||
verbose=False,
|
||||
):
|
||||
"""
|
||||
:param binary: If true, then treat all feature/value pairs as
|
||||
individual binary features, rather than using a single n-way
|
||||
branch for each feature.
|
||||
"""
|
||||
# Collect a list of all feature names.
|
||||
feature_names = set()
|
||||
for featureset, label in labeled_featuresets:
|
||||
for fname in featureset:
|
||||
feature_names.add(fname)
|
||||
|
||||
# Collect a list of the values each feature can take.
|
||||
if feature_values is None and binary:
|
||||
feature_values = defaultdict(set)
|
||||
for featureset, label in labeled_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
feature_values[fname].add(fval)
|
||||
|
||||
# Start with a stump.
|
||||
if not binary:
|
||||
tree = DecisionTreeClassifier.best_stump(
|
||||
feature_names, labeled_featuresets, verbose
|
||||
)
|
||||
else:
|
||||
tree = DecisionTreeClassifier.best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose
|
||||
)
|
||||
|
||||
# Refine the stump.
|
||||
tree.refine(
|
||||
labeled_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff - 1,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
|
||||
# Return it
|
||||
return tree
|
||||
|
||||
@staticmethod
|
||||
def leaf(labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
return DecisionTreeClassifier(label)
|
||||
|
||||
@staticmethod
|
||||
def stump(feature_name, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
freqs = defaultdict(FreqDist) # freq(label|value)
|
||||
for featureset, label in labeled_featuresets:
|
||||
feature_value = featureset.get(feature_name)
|
||||
freqs[feature_value][label] += 1
|
||||
|
||||
decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
|
||||
return DecisionTreeClassifier(label, feature_name, decisions)
|
||||
|
||||
def refine(
|
||||
self,
|
||||
labeled_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary=False,
|
||||
feature_values=None,
|
||||
verbose=False,
|
||||
):
|
||||
if len(labeled_featuresets) <= support_cutoff:
|
||||
return
|
||||
if self._fname is None:
|
||||
return
|
||||
if depth_cutoff <= 0:
|
||||
return
|
||||
for fval in self._decisions:
|
||||
fval_featuresets = [
|
||||
(featureset, label)
|
||||
for (featureset, label) in labeled_featuresets
|
||||
if featureset.get(self._fname) == fval
|
||||
]
|
||||
|
||||
label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
|
||||
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
|
||||
self._decisions[fval] = DecisionTreeClassifier.train(
|
||||
fval_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
if self._default is not None:
|
||||
default_featuresets = [
|
||||
(featureset, label)
|
||||
for (featureset, label) in labeled_featuresets
|
||||
if featureset.get(self._fname) not in self._decisions
|
||||
]
|
||||
label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
|
||||
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
|
||||
self._default = DecisionTreeClassifier.train(
|
||||
default_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def best_stump(feature_names, labeled_featuresets, verbose=False):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), best_stump._fname, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
@staticmethod
|
||||
def binary_stump(feature_name, feature_value, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
pos_fdist = FreqDist()
|
||||
neg_fdist = FreqDist()
|
||||
for featureset, label in labeled_featuresets:
|
||||
if featureset.get(feature_name) == feature_value:
|
||||
pos_fdist[label] += 1
|
||||
else:
|
||||
neg_fdist[label] += 1
|
||||
|
||||
decisions = {}
|
||||
default = label
|
||||
# But hopefully we have observations!
|
||||
if pos_fdist.N() > 0:
|
||||
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
|
||||
if neg_fdist.N() > 0:
|
||||
default = DecisionTreeClassifier(neg_fdist.max())
|
||||
|
||||
return DecisionTreeClassifier(label, feature_name, decisions, default)
|
||||
|
||||
@staticmethod
|
||||
def best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose=False
|
||||
):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
for fval in feature_values[fname]:
|
||||
stump = DecisionTreeClassifier.binary_stump(
|
||||
fname, fval, labeled_featuresets
|
||||
)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
if best_stump._decisions:
|
||||
descr = "{}={}".format(
|
||||
best_stump._fname, list(best_stump._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
descr = "(default)"
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), descr, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def f(x):
|
||||
return DecisionTreeClassifier.train(x, binary=True, verbose=True)
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import binary_names_demo_features, names_demo
|
||||
|
||||
classifier = names_demo(
|
||||
f, binary_names_demo_features # DecisionTreeClassifier.train,
|
||||
)
|
||||
print(classifier.pretty_format(depth=7))
|
||||
print(classifier.pseudocode(depth=7))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
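A quick usage sketch of ``DecisionTreeClassifier.train`` on a hand-made weather toy set (the data is invented for illustration). With the defaults, the learner picks the lowest-error single-feature stump as the root, so ``'outlook'`` ends up deciding the label here:

    >>> from nltk.classify.decisiontree import DecisionTreeClassifier
    >>> train_set = [
    ...     ({'outlook': 'sunny', 'humidity': 'high'}, 'play'),
    ...     ({'outlook': 'sunny', 'humidity': 'normal'}, 'play'),
    ...     ({'outlook': 'rainy', 'humidity': 'high'}, 'stay'),
    ...     ({'outlook': 'rainy', 'humidity': 'normal'}, 'stay'),
    ... ]
    >>> tree = DecisionTreeClassifier.train(train_set)
    >>> tree.classify({'outlook': 'sunny', 'humidity': 'high'})
    'play'
    >>> tree.classify({'outlook': 'rainy', 'humidity': 'normal'})
    'stay'
    >>> print(tree.pseudocode(depth=2))  # doctest: +SKIP

Note that ``support_cutoff`` defaults to 10, so with only four examples the stump is not refined any further.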
1631
backend/venv/Lib/site-packages/nltk/classify/maxent.py
Normal file
File diff suppressed because it is too large
184
backend/venv/Lib/site-packages/nltk/classify/megam.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Natural Language Toolkit: Interface to Megam Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A set of functions used to interface with the external megam_ maxent
|
||||
optimization package. Before megam can be used, you should tell NLTK where it
|
||||
can find the megam binary, using the ``config_megam()`` function. Typical
|
||||
usage:
|
||||
|
||||
>>> from nltk.classify import megam
|
||||
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
|
||||
[Found megam: ...]
|
||||
|
||||
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
|
||||
for details.
|
||||
|
||||
nltk.classify.MaxentClassifier.train(corpus, 'megam')
|
||||
|
||||
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
numpy = None
|
||||
|
||||
######################################################################
|
||||
# { Configuration
|
||||
######################################################################
|
||||
|
||||
_megam_bin = None
|
||||
|
||||
|
||||
def config_megam(bin=None):
|
||||
"""
|
||||
Configure NLTK's interface to the ``megam`` maxent optimization
|
||||
package.
|
||||
|
||||
:param bin: The full path to the ``megam`` binary. If not specified,
|
||||
then nltk will search the system for a ``megam`` binary; and if
|
||||
one is not found, it will raise a ``LookupError`` exception.
|
||||
:type bin: str
|
||||
"""
|
||||
global _megam_bin
|
||||
_megam_bin = find_binary(
|
||||
"megam",
|
||||
bin,
|
||||
env_vars=["MEGAM"],
|
||||
binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
|
||||
url="https://www.umiacs.umd.edu/~hal/megam/index.html",
|
||||
)
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Megam Interface Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
|
||||
"""
|
||||
Generate an input file for ``megam`` based on the given corpus of
|
||||
classified tokens.
|
||||
|
||||
:type train_toks: list(tuple(dict, str))
|
||||
:param train_toks: Training data, represented as a list of
|
||||
pairs, the first member of which is a feature dictionary,
|
||||
and the second of which is a classification label.
|
||||
|
||||
:type encoding: MaxentFeatureEncodingI
|
||||
:param encoding: A feature encoding, used to convert featuresets
|
||||
into feature vectors. May optionally implement a cost() method
|
||||
in order to assign different costs to different class predictions.
|
||||
|
||||
:type stream: stream
|
||||
:param stream: The stream to which the megam input file should be
|
||||
written.
|
||||
|
||||
:param bernoulli: If true, then use the 'bernoulli' format. I.e.,
|
||||
all joint features have binary values, and are listed iff they
|
||||
are true. Otherwise, list feature values explicitly. If
|
||||
``bernoulli=False``, then you must call ``megam`` with the
|
||||
``-fvals`` option.
|
||||
|
||||
:param explicit: If true, then use the 'explicit' format. I.e.,
|
||||
list the features that would fire for any of the possible
|
||||
labels, for each token. If ``explicit=True``, then you must
|
||||
call ``megam`` with the ``-explicit`` option.
|
||||
"""
|
||||
# Look up the set of labels.
|
||||
labels = encoding.labels()
|
||||
labelnum = {label: i for (i, label) in enumerate(labels)}
|
||||
|
||||
# Write the file, which contains one line per instance.
|
||||
for featureset, label in train_toks:
|
||||
# First, the instance number (or, in the weighted multiclass case, the cost of each label).
|
||||
if hasattr(encoding, "cost"):
|
||||
stream.write(
|
||||
":".join(str(encoding.cost(featureset, label, l)) for l in labels)
|
||||
)
|
||||
else:
|
||||
stream.write("%d" % labelnum[label])
|
||||
|
||||
# For implicit file formats, just list the features that fire
|
||||
# for this instance's actual label.
|
||||
if not explicit:
|
||||
_write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
|
||||
|
||||
# For explicit formats, list the features that would fire for
|
||||
# any of the possible labels.
|
||||
else:
|
||||
for l in labels:
|
||||
stream.write(" #")
|
||||
_write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
|
||||
|
||||
# End of the instance.
|
||||
stream.write("\n")
|
||||
|
||||
|
||||
def parse_megam_weights(s, features_count, explicit=True):
|
||||
"""
|
||||
Given the stdout output generated by ``megam`` when training a
|
||||
model, return a ``numpy`` array containing the corresponding weight
|
||||
vector. This function does not currently handle bias features.
|
||||
"""
|
||||
if numpy is None:
|
||||
raise ValueError("This function requires that numpy be installed")
|
||||
assert explicit, "non-explicit not supported yet"
|
||||
lines = s.strip().split("\n")
|
||||
weights = numpy.zeros(features_count, "d")
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
fid, weight = line.split()
|
||||
weights[int(fid)] = float(weight)
|
||||
return weights
|
||||
|
||||
|
||||
def _write_megam_features(vector, stream, bernoulli):
|
||||
if not vector:
|
||||
raise ValueError(
|
||||
"MEGAM classifier requires the use of an " "always-on feature."
|
||||
)
|
||||
for fid, fval in vector:
|
||||
if bernoulli:
|
||||
if fval == 1:
|
||||
stream.write(" %s" % fid)
|
||||
elif fval != 0:
|
||||
raise ValueError(
|
||||
"If bernoulli=True, then all" "features must be binary."
|
||||
)
|
||||
else:
|
||||
stream.write(f" {fid} {fval}")
|
||||
|
||||
|
||||
def call_megam(args):
|
||||
"""
|
||||
Call the ``megam`` binary with the given arguments.
|
||||
"""
|
||||
if isinstance(args, str):
|
||||
raise TypeError("args should be a list of strings")
|
||||
if _megam_bin is None:
|
||||
config_megam()
|
||||
|
||||
# Call megam via a subprocess
|
||||
cmd = [_megam_bin] + args
|
||||
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
||||
(stdout, stderr) = p.communicate()
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
print()
|
||||
print(stderr)
|
||||
raise OSError("megam command failed!")
|
||||
|
||||
if isinstance(stdout, str):
|
||||
return stdout
|
||||
else:
|
||||
return stdout.decode("utf-8")
|
||||
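``parse_megam_weights`` can be exercised without the external binary, since it only parses megam's textual output (one "feature-id weight" pair per line). The output string below is fabricated for illustration, and numpy must be installed:

    >>> from nltk.classify.megam import parse_megam_weights
    >>> fake_output = "0 0.75\n2 -1.25\n"
    >>> parse_megam_weights(fake_output, features_count=4).tolist()
    [0.75, 0.0, -1.25, 0.0]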
260
backend/venv/Lib/site-packages/nltk/classify/naivebayes.py
Normal file
@@ -0,0 +1,260 @@
|
||||
# Natural Language Toolkit: Naive Bayes Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier based on the Naive Bayes algorithm. In order to find the
|
||||
probability for a label, this algorithm first uses the Bayes rule to
|
||||
express P(label|features) in terms of P(label) and P(features|label):
|
||||
|
||||
| P(label) * P(features|label)
|
||||
| P(label|features) = ------------------------------
|
||||
| P(features)
|
||||
|
||||
The algorithm then makes the 'naive' assumption that all features are
|
||||
independent, given the label:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| P(features)
|
||||
|
||||
Rather than computing P(features) explicitly, the algorithm just
|
||||
calculates the numerator for each label, and normalizes them so they
|
||||
sum to one:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class NaiveBayesClassifier(ClassifierI):
|
||||
"""
|
||||
A Naive Bayes classifier. Naive Bayes classifiers are
|
||||
parameterized by two probability distributions:
|
||||
|
||||
- P(label) gives the probability that an input will receive each
|
||||
label, given no information about the input's features.
|
||||
|
||||
- P(fname=fval|label) gives the probability that a given feature
|
||||
(fname) will receive a given value (fval), given that the
|
||||
label (label).
|
||||
|
||||
If the classifier encounters an input with a feature that has
|
||||
never been seen with any label, then rather than assigning a
|
||||
probability of 0 to all labels, it will ignore that feature.
|
||||
|
||||
The feature value 'None' is reserved for unseen feature values;
|
||||
you generally should not use 'None' as a feature value for one of
|
||||
your own features.
|
||||
"""
|
||||
|
||||
def __init__(self, label_probdist, feature_probdist):
|
||||
"""
|
||||
:param label_probdist: P(label), the probability distribution
|
||||
over labels. It is expressed as a ``ProbDistI`` whose
|
||||
samples are labels. I.e., P(label) =
|
||||
``label_probdist.prob(label)``.
|
||||
|
||||
:param feature_probdist: P(fname=fval|label), the probability
|
||||
distribution for feature values, given labels. It is
|
||||
expressed as a dictionary whose keys are ``(label, fname)``
|
||||
pairs and whose values are ``ProbDistI`` objects over feature
|
||||
values. I.e., P(fname=fval|label) =
|
||||
``feature_probdist[label,fname].prob(fval)``. If a given
|
||||
``(label,fname)`` is not a key in ``feature_probdist``, then
|
||||
it is assumed that the corresponding P(fname=fval|label)
|
||||
is 0 for all values of ``fval``.
|
||||
"""
|
||||
self._label_probdist = label_probdist
|
||||
self._feature_probdist = feature_probdist
|
||||
self._labels = list(label_probdist.samples())
|
||||
|
||||
def labels(self):
|
||||
return self._labels
|
||||
|
||||
def classify(self, featureset):
|
||||
return self.prob_classify(featureset).max()
|
||||
|
||||
def prob_classify(self, featureset):
|
||||
# Discard any feature names that we've never seen before.
|
||||
# Otherwise, we'll just assign a probability of 0 to
|
||||
# everything.
|
||||
featureset = featureset.copy()
|
||||
for fname in list(featureset.keys()):
|
||||
for label in self._labels:
|
||||
if (label, fname) in self._feature_probdist:
|
||||
break
|
||||
else:
|
||||
# print('Ignoring unseen feature %s' % fname)
|
||||
del featureset[fname]
|
||||
|
||||
# Find the log probability of each label, given the features.
|
||||
# Start with the log probability of the label itself.
|
||||
logprob = {}
|
||||
for label in self._labels:
|
||||
logprob[label] = self._label_probdist.logprob(label)
|
||||
|
||||
# Then add in the log probability of features given labels.
|
||||
for label in self._labels:
|
||||
for fname, fval in featureset.items():
|
||||
if (label, fname) in self._feature_probdist:
|
||||
feature_probs = self._feature_probdist[label, fname]
|
||||
logprob[label] += feature_probs.logprob(fval)
|
||||
else:
|
||||
# nb: This case will never come up if the
|
||||
# classifier was created by
|
||||
# NaiveBayesClassifier.train().
|
||||
logprob[label] += sum_logs([]) # = -INF.
|
||||
|
||||
return DictionaryProbDist(logprob, normalize=True, log=True)
|
||||
|
||||
def show_most_informative_features(self, n=10):
|
||||
# Determine the most relevant features, and display them.
|
||||
cpdist = self._feature_probdist
|
||||
print("Most Informative Features")
|
||||
|
||||
for fname, fval in self.most_informative_features(n):
|
||||
|
||||
def labelprob(l):
|
||||
return cpdist[l, fname].prob(fval)
|
||||
|
||||
labels = sorted(
|
||||
(l for l in self._labels if fval in cpdist[l, fname].samples()),
|
||||
key=lambda element: (-labelprob(element), element),
|
||||
reverse=True,
|
||||
)
|
||||
if len(labels) == 1:
|
||||
continue
|
||||
l0 = labels[0]
|
||||
l1 = labels[-1]
|
||||
if cpdist[l0, fname].prob(fval) == 0:
|
||||
ratio = "INF"
|
||||
else:
|
||||
ratio = "%8.1f" % (
|
||||
cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
|
||||
)
|
||||
print(
|
||||
"%24s = %-14r %6s : %-6s = %s : 1.0"
|
||||
% (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
|
||||
)
|
||||
|
||||
def most_informative_features(self, n=100):
|
||||
"""
|
||||
Return a list of the 'most informative' features used by this
|
||||
classifier. For the purpose of this function, the
|
||||
informativeness of a feature ``(fname,fval)`` is equal to the
|
||||
highest value of P(fname=fval|label), for any label, divided by
|
||||
the lowest value of P(fname=fval|label), for any label:
|
||||
|
||||
| max[ P(fname=fval|label1) / P(fname=fval|label2) ]
|
||||
"""
|
||||
if hasattr(self, "_most_informative_features"):
|
||||
return self._most_informative_features[:n]
|
||||
else:
|
||||
# The set of (fname, fval) pairs used by this classifier.
|
||||
features = set()
|
||||
# The max & min probability associated w/ each (fname, fval)
|
||||
# pair. Maps (fname,fval) -> float.
|
||||
maxprob = defaultdict(float)
|
||||
minprob = defaultdict(lambda: 1.0)
|
||||
|
||||
for (label, fname), probdist in self._feature_probdist.items():
|
||||
for fval in probdist.samples():
|
||||
feature = (fname, fval)
|
||||
features.add(feature)
|
||||
p = probdist.prob(fval)
|
||||
maxprob[feature] = max(p, maxprob[feature])
|
||||
minprob[feature] = min(p, minprob[feature])
|
||||
if minprob[feature] == 0:
|
||||
features.discard(feature)
|
||||
|
||||
# Convert features to a list, & sort it by how informative
|
||||
# features are.
|
||||
self._most_informative_features = sorted(
|
||||
features,
|
||||
key=lambda feature_: (
|
||||
minprob[feature_] / maxprob[feature_],
|
||||
feature_[0],
|
||||
feature_[1] in [None, False, True],
|
||||
str(feature_[1]).lower(),
|
||||
),
|
||||
)
|
||||
return self._most_informative_features[:n]
|
||||
|
||||
@classmethod
|
||||
def train(cls, labeled_featuresets, estimator=ELEProbDist):
|
||||
"""
|
||||
:param labeled_featuresets: A list of classified featuresets,
|
||||
i.e., a list of tuples ``(featureset, label)``.
|
||||
"""
|
||||
label_freqdist = FreqDist()
|
||||
feature_freqdist = defaultdict(FreqDist)
|
||||
feature_values = defaultdict(set)
|
||||
fnames = set()
|
||||
|
||||
# Count up how many times each feature value occurred, given
|
||||
# the label and featurename.
|
||||
for featureset, label in labeled_featuresets:
|
||||
label_freqdist[label] += 1
|
||||
for fname, fval in featureset.items():
|
||||
# Increment freq(fval|label, fname)
|
||||
feature_freqdist[label, fname][fval] += 1
|
||||
# Record that fname can take the value fval.
|
||||
feature_values[fname].add(fval)
|
||||
# Keep a list of all feature names.
|
||||
fnames.add(fname)
|
||||
|
||||
# If a feature didn't have a value given for an instance, then
|
||||
# we assume that it gets the implicit value 'None.' This loop
|
||||
# counts up the number of 'missing' feature values for each
|
||||
# (label,fname) pair, and increments the count of the fval
|
||||
# 'None' by that amount.
|
||||
for label in label_freqdist:
|
||||
num_samples = label_freqdist[label]
|
||||
for fname in fnames:
|
||||
count = feature_freqdist[label, fname].N()
|
||||
# Only add a None key when necessary, i.e. if there are
|
||||
# any samples with feature 'fname' missing.
|
||||
if num_samples - count > 0:
|
||||
feature_freqdist[label, fname][None] += num_samples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
# Create the P(label) distribution
|
||||
label_probdist = estimator(label_freqdist)
|
||||
|
||||
# Create the P(fval|label, fname) distribution
|
||||
feature_probdist = {}
|
||||
for (label, fname), freqdist in feature_freqdist.items():
|
||||
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
feature_probdist[label, fname] = probdist
|
||||
|
||||
return cls(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import names_demo
|
||||
|
||||
classifier = names_demo(NaiveBayesClassifier.train)
|
||||
classifier.show_most_informative_features()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
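The normalization described in the module docstring above can be seen directly through ``prob_classify``. The toy last-letter training set is made up for illustration:

    >>> from nltk.classify.naivebayes import NaiveBayesClassifier
    >>> train_set = [
    ...     ({'last-letter': 'a'}, 'female'),
    ...     ({'last-letter': 'a'}, 'female'),
    ...     ({'last-letter': 'k'}, 'male'),
    ...     ({'last-letter': 'o'}, 'male'),
    ... ]
    >>> nb = NaiveBayesClassifier.train(train_set)
    >>> dist = nb.prob_classify({'last-letter': 'a'})
    >>> dist.max()
    'female'
    >>> round(dist.prob('female'), 3)
    0.833
    >>> round(dist.prob('female') + dist.prob('male'), 5)
    1.0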
180
backend/venv/Lib/site-packages/nltk/classify/positivenaivebayes.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# Natural Language Toolkit: Positive Naive Bayes Classifier
|
||||
#
|
||||
# Copyright (C) 2012 NLTK Project
|
||||
# Author: Alessandro Presta <alessandro.presta@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A variant of the Naive Bayes Classifier that performs binary classification with
|
||||
partially-labeled training sets. In other words, assume we want to build a classifier
|
||||
that assigns each example to one of two complementary classes (e.g., male names and
|
||||
female names).
|
||||
If we have a training set with labeled examples for both classes, we can use a
|
||||
standard Naive Bayes Classifier. However, consider the case when we only have labeled
|
||||
examples for one of the classes, and other, unlabeled, examples.
|
||||
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
|
||||
to estimate the frequencies of the various features.
|
||||
|
||||
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
|
||||
and unlabeled examples. We are also given an estimate of P(1).
|
||||
|
||||
We compute P(feature|1) exactly as in the standard case.
|
||||
|
||||
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
|
||||
assuming that the unlabeled examples are drawn according to the given prior distribution)
|
||||
and then express the conditional probability as:
|
||||
|
||||
| P(feature) - P(feature|1) * P(1)
|
||||
| P(feature|0) = ----------------------------------
|
||||
| P(0)
|
||||
|
||||
Example:
|
||||
|
||||
>>> from nltk.classify import PositiveNaiveBayesClassifier
|
||||
|
||||
Some sentences about sports:
|
||||
|
||||
>>> sports_sentences = [ 'The team dominated the game',
|
||||
... 'They lost the ball',
|
||||
... 'The game was intense',
|
||||
... 'The goalkeeper catched the ball',
|
||||
... 'The other team controlled the ball' ]
|
||||
|
||||
Mixed topics, including sports:
|
||||
|
||||
>>> various_sentences = [ 'The President did not comment',
|
||||
... 'I lost the keys',
|
||||
... 'The team won the game',
|
||||
... 'Sara has two kids',
|
||||
... 'The ball went off the court',
|
||||
... 'They had the ball for the whole game',
|
||||
... 'The show is over' ]
|
||||
|
||||
The features of a sentence are simply the words it contains:
|
||||
|
||||
>>> def features(sentence):
|
||||
... words = sentence.lower().split()
|
||||
... return dict(('contains(%s)' % w, True) for w in words)
|
||||
|
||||
We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
|
||||
|
||||
>>> positive_featuresets = map(features, sports_sentences)
|
||||
>>> unlabeled_featuresets = map(features, various_sentences)
|
||||
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
|
||||
... unlabeled_featuresets)
|
||||
|
||||
Is the following sentence about sports?
|
||||
|
||||
>>> classifier.classify(features('The cat is on the table'))
|
||||
False
|
||||
|
||||
What about this one?
|
||||
|
||||
>>> classifier.classify(features('My team lost the game'))
|
||||
True
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Positive Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
|
||||
@staticmethod
|
||||
def train(
|
||||
positive_featuresets,
|
||||
unlabeled_featuresets,
|
||||
positive_prob_prior=0.5,
|
||||
estimator=ELEProbDist,
|
||||
):
|
||||
"""
|
||||
:param positive_featuresets: An iterable of featuresets that are known as positive
|
||||
examples (i.e., their label is ``True``).
|
||||
|
||||
:param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
|
||||
|
||||
:param positive_prob_prior: A prior estimate of the probability of the label
|
||||
``True`` (default 0.5).
|
||||
"""
|
||||
positive_feature_freqdist = defaultdict(FreqDist)
|
||||
unlabeled_feature_freqdist = defaultdict(FreqDist)
|
||||
feature_values = defaultdict(set)
|
||||
fnames = set()
|
||||
|
||||
# Count up how many times each feature value occurred in positive examples.
|
||||
num_positive_examples = 0
|
||||
for featureset in positive_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
positive_feature_freqdist[fname][fval] += 1
|
||||
feature_values[fname].add(fval)
|
||||
fnames.add(fname)
|
||||
num_positive_examples += 1
|
||||
|
||||
# Count up how many times each feature value occurred in unlabeled examples.
|
||||
num_unlabeled_examples = 0
|
||||
for featureset in unlabeled_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
unlabeled_feature_freqdist[fname][fval] += 1
|
||||
feature_values[fname].add(fval)
|
||||
fnames.add(fname)
|
||||
num_unlabeled_examples += 1
|
||||
|
||||
# If a feature didn't have a value given for an instance, then we assume that
|
||||
# it gets the implicit value 'None'.
|
||||
for fname in fnames:
|
||||
count = positive_feature_freqdist[fname].N()
|
||||
positive_feature_freqdist[fname][None] += num_positive_examples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
for fname in fnames:
|
||||
count = unlabeled_feature_freqdist[fname].N()
|
||||
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
negative_prob_prior = 1.0 - positive_prob_prior
|
||||
|
||||
# Create the P(label) distribution.
|
||||
label_probdist = DictionaryProbDist(
|
||||
{True: positive_prob_prior, False: negative_prob_prior}
|
||||
)
|
||||
|
||||
# Create the P(fval|label, fname) distribution.
|
||||
feature_probdist = {}
|
||||
for fname, freqdist in positive_feature_freqdist.items():
|
||||
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
feature_probdist[True, fname] = probdist
|
||||
|
||||
for fname, freqdist in unlabeled_feature_freqdist.items():
|
||||
global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
negative_feature_probs = {}
|
||||
for fval in feature_values[fname]:
|
||||
prob = (
|
||||
global_probdist.prob(fval)
|
||||
- positive_prob_prior * feature_probdist[True, fname].prob(fval)
|
||||
) / negative_prob_prior
|
||||
# TODO: We need to add some kind of smoothing here, instead of
|
||||
# setting negative probabilities to zero and normalizing.
|
||||
negative_feature_probs[fval] = max(prob, 0.0)
|
||||
feature_probdist[False, fname] = DictionaryProbDist(
|
||||
negative_feature_probs, normalize=True
|
||||
)
|
||||
|
||||
return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import partial_names_demo
|
||||
|
||||
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
|
||||
classifier.show_most_informative_features()
|
||||
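The P(feature|0) formula in the docstring above is easy to sanity-check numerically. The probabilities below are invented for illustration (P(feature) estimated from the unlabeled set, P(feature|1) from the positive set, prior P(1) = 0.5):

    >>> p_feature, p_feature_pos, p_pos = 0.4, 0.6, 0.5
    >>> round((p_feature - p_feature_pos * p_pos) / (1 - p_pos), 3)
    0.2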
183
backend/venv/Lib/site-packages/nltk/classify/rte_classify.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: RTE Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Simple classifier for RTE corpus.
|
||||
|
||||
It calculates the overlap in words and named entities between text and
|
||||
hypothesis, and also whether there are words / named entities in the
|
||||
hypothesis which fail to occur in the text, since this is an indicator that
|
||||
the hypothesis is more informative than (i.e. not entailed by) the text.
|
||||
|
||||
TO DO: better Named Entity classification
|
||||
TO DO: add lemmatization
|
||||
"""
|
||||
|
||||
from nltk.classify.maxent import MaxentClassifier
|
||||
from nltk.classify.util import accuracy
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
|
||||
class RTEFeatureExtractor:
|
||||
"""
|
||||
This builds a bag of words for both the text and the hypothesis after
|
||||
throwing away some stopwords, then calculates overlap and difference.
|
||||
"""
|
||||
|
||||
def __init__(self, rtepair, stop=True, use_lemmatize=False):
|
||||
"""
|
||||
:param rtepair: a ``RTEPair`` from which features should be extracted
|
||||
:param stop: if ``True``, stopwords are thrown away.
|
||||
:type stop: bool
|
||||
"""
|
||||
self.stop = stop
|
||||
self.stopwords = {
|
||||
"a",
|
||||
"the",
|
||||
"it",
|
||||
"they",
|
||||
"of",
|
||||
"in",
|
||||
"to",
|
||||
"is",
|
||||
"have",
|
||||
"are",
|
||||
"were",
|
||||
"and",
|
||||
"very",
|
||||
".",
|
||||
",",
|
||||
}
|
||||
|
||||
self.negwords = {"no", "not", "never", "failed", "rejected", "denied"}
|
||||
# Try to tokenize so that abbreviations, monetary amounts, email
|
||||
# addresses, URLs are single tokens.
|
||||
tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
|
||||
|
||||
# Get the set of word types for text and hypothesis
|
||||
self.text_tokens = tokenizer.tokenize(rtepair.text)
|
||||
self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
|
||||
self.text_words = set(self.text_tokens)
|
||||
self.hyp_words = set(self.hyp_tokens)
|
||||
|
||||
if use_lemmatize:
|
||||
self.text_words = {self._lemmatize(token) for token in self.text_tokens}
|
||||
self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens}
|
||||
|
||||
if self.stop:
|
||||
self.text_words = self.text_words - self.stopwords
|
||||
self.hyp_words = self.hyp_words - self.stopwords
|
||||
|
||||
self._overlap = self.hyp_words & self.text_words
|
||||
self._hyp_extra = self.hyp_words - self.text_words
|
||||
self._txt_extra = self.text_words - self.hyp_words
|
||||
|
||||
def overlap(self, toktype, debug=False):
|
||||
"""
|
||||
Compute the overlap between text and hypothesis.
|
||||
|
||||
:param toktype: distinguish Named Entities from ordinary words
|
||||
:type toktype: 'ne' or 'word'
|
||||
"""
|
||||
ne_overlap = {token for token in self._overlap if self._ne(token)}
|
||||
if toktype == "ne":
|
||||
if debug:
|
||||
print("ne overlap", ne_overlap)
|
||||
return ne_overlap
|
||||
elif toktype == "word":
|
||||
if debug:
|
||||
print("word overlap", self._overlap - ne_overlap)
|
||||
return self._overlap - ne_overlap
|
||||
else:
|
||||
raise ValueError("Type not recognized:'%s'" % toktype)
|
||||
|
||||
def hyp_extra(self, toktype, debug=True):
|
||||
"""
|
||||
Compute the extraneous material in the hypothesis.
|
||||
|
||||
:param toktype: distinguish Named Entities from ordinary words
|
||||
:type toktype: 'ne' or 'word'
|
||||
"""
|
||||
ne_extra = {token for token in self._hyp_extra if self._ne(token)}
|
||||
if toktype == "ne":
|
||||
return ne_extra
|
||||
elif toktype == "word":
|
||||
return self._hyp_extra - ne_extra
|
||||
else:
|
||||
raise ValueError("Type not recognized: '%s'" % toktype)
|
||||
|
||||
@staticmethod
|
||||
def _ne(token):
|
||||
"""
|
||||
This just assumes that words in all caps or titles are
|
||||
named entities.
|
||||
|
||||
:type token: str
|
||||
"""
|
||||
if token.istitle() or token.isupper():
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _lemmatize(word):
|
||||
"""
|
||||
Use morphy from WordNet to find the base form of verbs.
|
||||
"""
|
||||
from nltk.corpus import wordnet as wn
|
||||
|
||||
lemma = wn.morphy(word, pos=wn.VERB)
|
||||
if lemma is not None:
|
||||
return lemma
|
||||
return word
|
||||
|
||||
|
||||
def rte_features(rtepair):
|
||||
extractor = RTEFeatureExtractor(rtepair)
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["word_overlap"] = len(extractor.overlap("word"))
|
||||
features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
|
||||
features["ne_overlap"] = len(extractor.overlap("ne"))
|
||||
features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
|
||||
features["neg_txt"] = len(extractor.negwords & extractor.text_words)
|
||||
features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
|
||||
return features
|
||||
|
||||
|
||||
def rte_featurize(rte_pairs):
|
||||
return [(rte_features(pair), pair.value) for pair in rte_pairs]
|
||||
|
||||
|
||||
def rte_classifier(algorithm, sample_N=None):
|
||||
from nltk.corpus import rte as rte_corpus
|
||||
|
||||
train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
|
||||
test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
|
||||
|
||||
if sample_N is not None:
|
||||
train_set = train_set[:sample_N]
|
||||
test_set = test_set[:sample_N]
|
||||
|
||||
featurized_train_set = rte_featurize(train_set)
|
||||
featurized_test_set = rte_featurize(test_set)
|
||||
|
||||
# Train the classifier
|
||||
print("Training classifier...")
|
||||
if algorithm in ["megam"]: # MEGAM based algorithms.
|
||||
clf = MaxentClassifier.train(featurized_train_set, algorithm)
|
||||
elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm
|
||||
clf = MaxentClassifier.train(featurized_train_set, algorithm)
|
||||
else:
|
||||
err_msg = str(
|
||||
"RTEClassifier only supports these algorithms:\n "
|
||||
"'megam', 'GIS', 'IIS'.\n"
|
||||
)
|
||||
raise Exception(err_msg)
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(clf, featurized_test_set)
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
return clf
|
||||
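``RTEFeatureExtractor`` only needs an object with ``text`` and ``hyp`` attributes, so the overlap and hypothesis-extra sets can be sketched on a made-up pair (a namedtuple standing in for a corpus ``RTEPair``):

    >>> from collections import namedtuple
    >>> from nltk.classify.rte_classify import RTEFeatureExtractor, rte_features
    >>> FakePair = namedtuple('FakePair', ['text', 'hyp'])
    >>> pair = FakePair(text='John gave Mary a book', hyp='Mary received a book from John')
    >>> extractor = RTEFeatureExtractor(pair)
    >>> sorted(extractor.overlap('ne'))
    ['John', 'Mary']
    >>> sorted(extractor.hyp_extra('word'))
    ['from', 'received']
    >>> rte_features(pair)['word_overlap']
    1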
143
backend/venv/Lib/site-packages/nltk/classify/scikitlearn.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# Natural Language Toolkit: Interface to scikit-learn classifiers
|
||||
#
|
||||
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
scikit-learn (https://scikit-learn.org) is a machine learning library for
|
||||
Python. It supports many classification algorithms, including SVMs,
|
||||
Naive Bayes, logistic regression (MaxEnt) and decision trees.
|
||||
|
||||
This package implements a wrapper around scikit-learn classifiers. To use this
|
||||
wrapper, construct a scikit-learn estimator object, then use that to construct
|
||||
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
|
||||
|
||||
>>> from sklearn.svm import LinearSVC
|
||||
>>> from nltk.classify.scikitlearn import SklearnClassifier
|
||||
>>> classif = SklearnClassifier(LinearSVC())
|
||||
|
||||
A scikit-learn classifier may include preprocessing steps when it's wrapped
|
||||
in a Pipeline object. The following constructs and wraps a Naive Bayes text
|
||||
classifier with tf-idf weighting and chi-square feature selection to get the
|
||||
best 1000 features:
|
||||
|
||||
>>> from sklearn.feature_extraction.text import TfidfTransformer
|
||||
>>> from sklearn.feature_selection import SelectKBest, chi2
|
||||
>>> from sklearn.naive_bayes import MultinomialNB
|
||||
>>> from sklearn.pipeline import Pipeline
|
||||
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
|
||||
... ('chi2', SelectKBest(chi2, k=1000)),
|
||||
... ('nb', MultinomialNB())])
|
||||
>>> classif = SklearnClassifier(pipeline)
|
||||
"""
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
try:
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
__all__ = ["SklearnClassifier"]
|
||||
|
||||
|
||||
class SklearnClassifier(ClassifierI):
|
||||
"""Wrapper for scikit-learn classifiers."""
|
||||
|
||||
def __init__(self, estimator, dtype=float, sparse=True):
|
||||
"""
|
||||
:param estimator: scikit-learn classifier object.
|
||||
|
||||
:param dtype: data type used when building feature array.
|
||||
scikit-learn estimators work exclusively on numeric data. The
|
||||
default value should be fine for almost all situations.
|
||||
|
||||
:param sparse: Whether to use sparse matrices internally.
|
||||
The estimator must support these; not all scikit-learn classifiers
|
||||
do (see their respective documentation and look for "sparse
|
||||
matrix"). The default value is True, since most NLP problems
|
||||
involve sparse feature sets. Setting this to False may require a
great deal of memory.
|
||||
:type sparse: boolean.
|
||||
"""
|
||||
self._clf = estimator
|
||||
self._encoder = LabelEncoder()
|
||||
self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
|
||||
|
||||
def __repr__(self):
|
||||
return "<SklearnClassifier(%r)>" % self._clf
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
"""Classify a batch of samples.
|
||||
|
||||
:param featuresets: An iterable over featuresets, each a dict mapping
|
||||
strings to either numbers, booleans or strings.
|
||||
:return: The predicted class label for each input sample.
|
||||
:rtype: list
|
||||
"""
|
||||
X = self._vectorizer.transform(featuresets)
|
||||
classes = self._encoder.classes_
|
||||
return [classes[i] for i in self._clf.predict(X)]
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
"""Compute per-class probabilities for a batch of samples.
|
||||
|
||||
:param featuresets: An iterable over featuresets, each a dict mapping
|
||||
strings to either numbers, booleans or strings.
|
||||
:rtype: list of ``ProbDistI``
|
||||
"""
|
||||
X = self._vectorizer.transform(featuresets)
|
||||
y_proba_list = self._clf.predict_proba(X)
|
||||
return [self._make_probdist(y_proba) for y_proba in y_proba_list]
|
||||
|
||||
def labels(self):
|
||||
"""The class labels used by this classifier.
|
||||
|
||||
:rtype: list
|
||||
"""
|
||||
return list(self._encoder.classes_)
|
||||
|
||||
def train(self, labeled_featuresets):
|
||||
"""
|
||||
Train (fit) the scikit-learn estimator.
|
||||
|
||||
:param labeled_featuresets: A list of ``(featureset, label)``
|
||||
where each ``featureset`` is a dict mapping strings to either
|
||||
numbers, booleans or strings.
|
||||
"""
|
||||
|
||||
X, y = list(zip(*labeled_featuresets))
|
||||
X = self._vectorizer.fit_transform(X)
|
||||
y = self._encoder.fit_transform(y)
|
||||
self._clf.fit(X, y)
|
||||
|
||||
return self
|
||||
|
||||
def _make_probdist(self, y_proba):
|
||||
classes = self._encoder.classes_
|
||||
return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
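# Hedged illustration (added; not part of the original NLTK source): a tiny
# end-to-end round trip with dict featuresets, assuming scikit-learn is
# installed so that SklearnClassifier can be constructed.
def _sklearn_classifier_example():
    from sklearn.naive_bayes import BernoulliNB

    train = [
        ({"a": True, "b": False}, "x"),
        ({"a": False, "b": True}, "y"),
    ]
    clf = SklearnClassifier(BernoulliNB()).train(train)
    # classify_many() vectorizes the featuresets and decodes the labels.
    return clf.classify_many([{"a": True, "b": False}])  # expected: ["x"]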
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
|
||||
from nltk.classify.util import names_demo, names_demo_features
|
||||
|
||||
# Bernoulli Naive Bayes is designed for binary classification. We set the
|
||||
# binarize option to False since we know we're passing boolean features.
|
||||
print("scikit-learn Naive Bayes:")
|
||||
names_demo(
|
||||
SklearnClassifier(BernoulliNB(binarize=False)).train,
|
||||
features=names_demo_features,
|
||||
)
|
||||
|
||||
# The C parameter on logistic regression (MaxEnt) controls regularization.
|
||||
# The higher it's set, the less regularized the classifier is.
|
||||
print("\n\nscikit-learn logistic regression:")
|
||||
names_demo(
|
||||
SklearnClassifier(LogisticRegression(C=1000)).train,
|
||||
features=names_demo_features,
|
||||
)
|
||||
175
backend/venv/Lib/site-packages/nltk/classify/senna.py
Normal file
@@ -0,0 +1,175 @@
|
||||
# Natural Language Toolkit: Senna Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A general interface to the SENNA pipeline that supports any of the
|
||||
operations specified in SUPPORTED_OPERATIONS.
|
||||
|
||||
Applying multiple operations at once has a speed advantage. For example,
Senna automatically determines POS tags while extracting named entities,
so applying both operations costs only the time of extracting the named
entities.
|
||||
|
||||
The SENNA pipeline has a fixed maximum sentence size that it can read,
1024 tokens per sentence by default. If you have longer sentences, consider
changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding your
system-specific binary; otherwise misalignment errors may occur.
|
||||
|
||||
The input is:
|
||||
|
||||
- the path to the directory that contains the SENNA executables. If the path is
  incorrect, Senna falls back to the executable specified in the SENNA environment variable
- the list of operations to be performed
- (optionally) the encoding of the input data (default: utf-8)
|
||||
|
||||
Note: Unit tests for this module can be found in test/unit/test_senna.py
|
||||
|
||||
>>> from nltk.classify import Senna
|
||||
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
|
||||
>>> sent = 'Dusseldorf is an international business center'.split()
|
||||
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
|
||||
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
|
||||
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
|
||||
"""
|
||||
|
||||
from os import environ, path, sep
|
||||
from platform import architecture, system
|
||||
from subprocess import PIPE, Popen
|
||||
|
||||
from nltk.tag.api import TaggerI
|
||||
|
||||
|
||||
class Senna(TaggerI):
|
||||
SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
|
||||
|
||||
def __init__(self, senna_path, operations, encoding="utf-8"):
|
||||
self._encoding = encoding
|
||||
self._path = path.normpath(senna_path) + sep
|
||||
|
||||
# Verifies the existence of the executable on the self._path first
|
||||
# senna_binary_file_1 = self.executable(self._path)
|
||||
exe_file_1 = self.executable(self._path)
|
||||
if not path.isfile(exe_file_1):
|
||||
# Check for the system environment
|
||||
if "SENNA" in environ:
|
||||
# self._path = path.join(environ['SENNA'],'')
|
||||
self._path = path.normpath(environ["SENNA"]) + sep
|
||||
exe_file_2 = self.executable(self._path)
|
||||
if not path.isfile(exe_file_2):
|
||||
raise LookupError(
|
||||
"Senna executable expected at %s or %s but not found"
|
||||
% (exe_file_1, exe_file_2)
|
||||
)
|
||||
|
||||
self.operations = operations
|
||||
|
||||
def executable(self, base_path):
|
||||
"""
|
||||
Determine the system-specific binary that should be used in the pipeline.
If the system is not recognized, the default 'senna' binary is used.
|
||||
"""
|
||||
os_name = system()
|
||||
if os_name == "Linux":
|
||||
bits = architecture()[0]
|
||||
if bits == "64bit":
|
||||
return path.join(base_path, "senna-linux64")
|
||||
return path.join(base_path, "senna-linux32")
|
||||
if os_name == "Windows":
|
||||
return path.join(base_path, "senna-win32.exe")
|
||||
if os_name == "Darwin":
|
||||
return path.join(base_path, "senna-osx")
|
||||
return path.join(base_path, "senna")
|
||||
|
||||
def _map(self):
|
||||
"""
|
||||
Calculate which output column the SENNA pipeline will write each requested
tag into. The column order follows the order of SUPPORTED_OPERATIONS.
|
||||
"""
|
||||
_map = {}
|
||||
i = 1
|
||||
for operation in Senna.SUPPORTED_OPERATIONS:
|
||||
if operation in self.operations:
|
||||
_map[operation] = i
|
||||
i += 1
|
||||
return _map
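# Illustration (added): with operations=["pos", "ner"], _map() returns
# {"pos": 1, "ner": 2}, i.e. the index of the tab-separated column in
# Senna's output that holds each requested tag (the first column, index 0,
# is the token itself).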
|
||||
|
||||
def tag(self, tokens):
|
||||
"""
|
||||
Applies the specified operation(s) on a list of tokens.
|
||||
"""
|
||||
return self.tag_sents([tokens])[0]
|
||||
|
||||
def tag_sents(self, sentences):
|
||||
"""
|
||||
Applies the tag method over a list of sentences. This method will return a
|
||||
list of dictionaries. Every dictionary will contain a word with its
|
||||
calculated annotations/tags.
|
||||
"""
|
||||
encoding = self._encoding
|
||||
|
||||
if not path.isfile(self.executable(self._path)):
|
||||
raise LookupError(
|
||||
"Senna executable expected at %s but not found"
|
||||
% self.executable(self._path)
|
||||
)
|
||||
|
||||
# Build the senna command to run the tagger
|
||||
_senna_cmd = [
|
||||
self.executable(self._path),
|
||||
"-path",
|
||||
self._path,
|
||||
"-usrtokens",
|
||||
"-iobtags",
|
||||
]
|
||||
_senna_cmd.extend(["-" + op for op in self.operations])
|
||||
|
||||
# Serialize the actual sentences to a temporary string
|
||||
_input = "\n".join(" ".join(x) for x in sentences) + "\n"
|
||||
if isinstance(_input, str) and encoding:
|
||||
_input = _input.encode(encoding)
|
||||
|
||||
# Run the tagger and get the output
|
||||
p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
|
||||
(stdout, stderr) = p.communicate(input=_input)
|
||||
senna_output = stdout
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
raise RuntimeError("Senna command failed! Details: %s" % stderr)
|
||||
|
||||
if encoding:
|
||||
senna_output = stdout.decode(encoding)
|
||||
|
||||
# Output the tagged sentences
|
||||
map_ = self._map()
|
||||
tagged_sentences = [[]]
|
||||
sentence_index = 0
|
||||
token_index = 0
|
||||
for tagged_word in senna_output.strip().split("\n"):
|
||||
if not tagged_word:
|
||||
tagged_sentences.append([])
|
||||
sentence_index += 1
|
||||
token_index = 0
|
||||
continue
|
||||
tags = tagged_word.split("\t")
|
||||
result = {}
|
||||
for tag in map_:
|
||||
result[tag] = tags[map_[tag]].strip()
|
||||
try:
|
||||
result["word"] = sentences[sentence_index][token_index]
|
||||
except IndexError as e:
|
||||
raise IndexError(
|
||||
"Misalignment error occurred at sentence number %d. Possible reason"
|
||||
" is that the sentence size exceeded the maximum size. Check the "
|
||||
"documentation of Senna class for more information."
|
||||
% sentence_index
|
||||
) from e
|
||||
tagged_sentences[-1].append(result)
|
||||
token_index += 1
|
||||
return tagged_sentences
|
||||
17
backend/venv/Lib/site-packages/nltk/classify/svm.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# Natural Language Toolkit: SVM-based classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
nltk.classify.svm was deprecated. For classification based
on support vector machines (SVMs), use nltk.classify.scikitlearn
(or `scikit-learn <https://scikit-learn.org>`_ directly).
|
||||
"""
|
||||
|
||||
|
||||
class SvmClassifier:
|
||||
def __init__(self, *args, **kwargs):
|
||||
raise NotImplementedError(__doc__)
|
||||
122
backend/venv/Lib/site-packages/nltk/classify/tadm.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# Natural Language Toolkit: Interface to TADM Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
_tadm_bin = None
|
||||
|
||||
|
||||
def config_tadm(bin=None):
|
||||
global _tadm_bin
|
||||
_tadm_bin = find_binary(
|
||||
"tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
|
||||
)
|
||||
|
||||
|
||||
def write_tadm_file(train_toks, encoding, stream):
|
||||
"""
|
||||
Generate an input file for ``tadm`` based on the given corpus of
|
||||
classified tokens.
|
||||
|
||||
:type train_toks: list(tuple(dict, str))
|
||||
:param train_toks: Training data, represented as a list of
|
||||
pairs, the first member of which is a feature dictionary,
|
||||
and the second of which is a classification label.
|
||||
:type encoding: TadmEventMaxentFeatureEncoding
|
||||
:param encoding: A feature encoding, used to convert featuresets
|
||||
into feature vectors.
|
||||
:type stream: stream
|
||||
:param stream: The stream to which the ``tadm`` input file should be
|
||||
written.
|
||||
"""
|
||||
# See the following for a file format description:
|
||||
#
|
||||
# https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
|
||||
# https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
|
||||
labels = encoding.labels()
|
||||
for featureset, label in train_toks:
|
||||
length_line = "%d\n" % len(labels)
|
||||
stream.write(length_line)
|
||||
for known_label in labels:
|
||||
v = encoding.encode(featureset, known_label)
|
||||
line = "%d %d %s\n" % (
|
||||
int(label == known_label),
|
||||
len(v),
|
||||
" ".join("%d %d" % u for u in v),
|
||||
)
|
||||
stream.write(line)
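# Illustration (added; values are made up): for a token whose encoding has two
# candidate labels, the loop above emits one block such as
#
#   2
#   1 2 0 1 2 1
#   0 2 5 1 6 2
#
# i.e. the number of labels, then one line per label holding the indicator
# (1 for the true label), the number of encoded (feature_id, value) pairs,
# and the pairs themselves flattened onto the same line.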
|
||||
|
||||
|
||||
def parse_tadm_weights(paramfile):
|
||||
"""
|
||||
Given the stdout output generated by ``tadm`` when training a
|
||||
model, return a ``numpy`` array containing the corresponding weight
|
||||
vector.
|
||||
"""
|
||||
weights = []
|
||||
for line in paramfile:
|
||||
weights.append(float(line.strip()))
|
||||
return numpy.array(weights, "d")
|
||||
|
||||
|
||||
def call_tadm(args):
|
||||
"""
|
||||
Call the ``tadm`` binary with the given arguments.
|
||||
"""
|
||||
if isinstance(args, str):
|
||||
raise TypeError("args should be a list of strings")
|
||||
if _tadm_bin is None:
|
||||
config_tadm()
|
||||
|
||||
# Call tadm via a subprocess
|
||||
cmd = [_tadm_bin] + args
|
||||
p = subprocess.Popen(cmd, stdout=sys.stdout)
|
||||
(stdout, stderr) = p.communicate()
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
print()
|
||||
print(stderr)
|
||||
raise OSError("tadm command failed!")
|
||||
|
||||
|
||||
def names_demo():
|
||||
from nltk.classify.maxent import TadmMaxentClassifier
|
||||
from nltk.classify.util import names_demo
|
||||
|
||||
classifier = names_demo(TadmMaxentClassifier.train)
|
||||
|
||||
|
||||
def encoding_demo():
|
||||
import sys
|
||||
|
||||
from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
|
||||
|
||||
tokens = [
|
||||
({"f0": 1, "f1": 1, "f3": 1}, "A"),
|
||||
({"f0": 1, "f2": 1, "f4": 1}, "B"),
|
||||
({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
|
||||
]
|
||||
encoding = TadmEventMaxentFeatureEncoding.train(tokens)
|
||||
write_tadm_file(tokens, encoding, sys.stdout)
|
||||
print()
|
||||
for i in range(encoding.length()):
|
||||
print("%s --> %d" % (encoding.describe(i), i))
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
encoding_demo()
|
||||
names_demo()
|
||||
193
backend/venv/Lib/site-packages/nltk/classify/textcat.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Language ID module using TextCat algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A module for language identification using the TextCat algorithm.
|
||||
An implementation of the text categorization algorithm
|
||||
presented in Cavnar, W. B. and J. M. Trenkle,
|
||||
"N-Gram-Based Text Categorization".
|
||||
|
||||
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the languages and the text yet to
be identified, then compares the profiles using a distance measure.
|
||||
|
||||
Language n-grams are provided by the "An Crubadan"
|
||||
project. A corpus reader was created separately to read
|
||||
those files.
|
||||
|
||||
For details regarding the algorithm, see:
|
||||
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
|
||||
|
||||
For details about An Crubadan, see:
|
||||
https://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from sys import maxsize
|
||||
|
||||
from nltk.util import trigrams
|
||||
|
||||
# Note: this is NOT the "re" module you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
|
||||
try:
|
||||
import regex as re
|
||||
except ImportError:
|
||||
re = None
|
||||
######################################################################
|
||||
## Language identification using TextCat
|
||||
######################################################################
|
||||
|
||||
|
||||
class TextCat:
|
||||
_corpus = None
|
||||
fingerprints = {}
|
||||
_START_CHAR = "<"
|
||||
_END_CHAR = ">"
|
||||
|
||||
last_distances = {}
|
||||
|
||||
def __init__(self):
|
||||
if not re:
|
||||
raise OSError(
|
||||
"classify.textcat requires the regex module that "
|
||||
"supports unicode. Try '$ pip install regex' and "
|
||||
"see https://pypi.python.org/pypi/regex for "
|
||||
"further details."
|
||||
)
|
||||
|
||||
from nltk.corpus import crubadan
|
||||
|
||||
self._corpus = crubadan
|
||||
# Load all language ngrams into cache
|
||||
for lang in self._corpus.langs():
|
||||
self._corpus.lang_freq(lang)
|
||||
|
||||
def remove_punctuation(self, text):
|
||||
"""Get rid of punctuation except apostrophes"""
|
||||
return re.sub(r"[^\P{P}\']+", "", text)
|
||||
|
||||
def profile(self, text):
|
||||
"""Create FreqDist of trigrams within text"""
|
||||
from nltk import FreqDist, word_tokenize
|
||||
|
||||
clean_text = self.remove_punctuation(text)
|
||||
tokens = word_tokenize(clean_text)
|
||||
|
||||
fingerprint = FreqDist()
|
||||
for t in tokens:
|
||||
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
|
||||
token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
|
||||
|
||||
for cur_trigram in token_trigrams:
|
||||
if cur_trigram in fingerprint:
|
||||
fingerprint[cur_trigram] += 1
|
||||
else:
|
||||
fingerprint[cur_trigram] = 1
|
||||
|
||||
return fingerprint
|
||||
|
||||
def calc_dist(self, lang, trigram, text_profile):
|
||||
"""Calculate the "out-of-place" measure between the
|
||||
text and language profile for a single trigram"""
|
||||
|
||||
lang_fd = self._corpus.lang_freq(lang)
|
||||
dist = 0
|
||||
|
||||
if trigram in lang_fd:
|
||||
idx_lang_profile = list(lang_fd.keys()).index(trigram)
|
||||
idx_text = list(text_profile.keys()).index(trigram)
|
||||
|
||||
# print(idx_lang_profile, ", ", idx_text)
|
||||
dist = abs(idx_lang_profile - idx_text)
|
||||
else:
|
||||
# Arbitrary but should be larger than
|
||||
# any possible trigram file length
|
||||
# in terms of total lines
|
||||
dist = maxsize
|
||||
|
||||
return dist
|
||||
|
||||
def lang_dists(self, text):
|
||||
"""Calculate the "out-of-place" measure between
|
||||
the text and all languages"""
|
||||
|
||||
distances = {}
|
||||
profile = self.profile(text)
|
||||
# For all the languages
|
||||
for lang in self._corpus._all_lang_freq.keys():
|
||||
# Calculate distance metric for every trigram in
|
||||
# input text to be identified
|
||||
lang_dist = 0
|
||||
for trigram in profile:
|
||||
lang_dist += self.calc_dist(lang, trigram, profile)
|
||||
|
||||
distances[lang] = lang_dist
|
||||
|
||||
return distances
|
||||
|
||||
def guess_language(self, text):
|
||||
"""Find the language with the min distance
|
||||
to the text and return its ISO 639-3 code"""
|
||||
self.last_distances = self.lang_dists(text)
|
||||
|
||||
return min(self.last_distances, key=self.last_distances.get)
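# Hedged usage sketch (added; not part of the original module). It assumes
# the crubadan and punkt data that TextCat relies on have been downloaded.
def _guess_language_example():
    tc = TextCat()
    # Returns an ISO 639-3 code, e.g. "nld" for this Dutch snippet.
    return tc.guess_language("Dit is een korte Nederlandse zin.")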
|
||||
#################################################
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.corpus import udhr
|
||||
|
||||
langs = [
|
||||
"Kurdish-UTF8",
|
||||
"Abkhaz-UTF8",
|
||||
"Farsi_Persian-UTF8",
|
||||
"Hindi-UTF8",
|
||||
"Hawaiian-UTF8",
|
||||
"Russian-UTF8",
|
||||
"Vietnamese-UTF8",
|
||||
"Serbian_Srpski-UTF8",
|
||||
"Esperanto-UTF8",
|
||||
]
|
||||
|
||||
friendly = {
|
||||
"kmr": "Northern Kurdish",
|
||||
"abk": "Abkhazian",
|
||||
"pes": "Iranian Persian",
|
||||
"hin": "Hindi",
|
||||
"haw": "Hawaiian",
|
||||
"rus": "Russian",
|
||||
"vie": "Vietnamese",
|
||||
"srp": "Serbian",
|
||||
"epo": "Esperanto",
|
||||
}
|
||||
|
||||
tc = TextCat()
|
||||
|
||||
for cur_lang in langs:
|
||||
# Get raw data from UDHR corpus
|
||||
raw_sentences = udhr.sents(cur_lang)
|
||||
rows = len(raw_sentences) - 1
|
||||
cols = list(map(len, raw_sentences))
|
||||
|
||||
sample = ""
|
||||
|
||||
# Generate a sample text of the language
|
||||
for i in range(0, rows):
|
||||
cur_sent = " " + " ".join([raw_sentences[i][j] for j in range(0, cols[i])])
|
||||
sample += cur_sent
|
||||
|
||||
# Try to detect what it is
|
||||
print("Language snippet: " + sample[0:140] + "...")
|
||||
guess = tc.guess_language(sample)
|
||||
print(f"Language detection: {guess} ({friendly[guess]})")
|
||||
print("#" * 140)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
347
backend/venv/Lib/site-packages/nltk/classify/util.py
Normal file
@@ -0,0 +1,347 @@
|
||||
# Natural Language Toolkit: Classifier Utility Functions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Utility functions and classes for classifiers.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
# from nltk.util import Deprecated
|
||||
import nltk.classify.util # for accuracy & log_likelihood
|
||||
from nltk.util import LazyMap
|
||||
|
||||
######################################################################
|
||||
# { Helper Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
# alternative name possibility: 'map_featurefunc()'?
|
||||
# alternative name possibility: 'detect_features()'?
|
||||
# alternative name possibility: 'map_featuredetect()'?
|
||||
# or.. just have users use LazyMap directly?
|
||||
def apply_features(feature_func, toks, labeled=None):
|
||||
"""
|
||||
Use the ``LazyMap`` class to construct a lazy list-like
|
||||
object that is analogous to ``map(feature_func, toks)``. In
|
||||
particular, if ``labeled=False``, then the returned list-like
|
||||
object's values are equal to::
|
||||
|
||||
[feature_func(tok) for tok in toks]
|
||||
|
||||
If ``labeled=True``, then the returned list-like object's values
|
||||
are equal to::
|
||||
|
||||
[(feature_func(tok), label) for (tok, label) in toks]
|
||||
|
||||
The primary purpose of this function is to avoid the memory
|
||||
overhead involved in storing all the featuresets for every token
|
||||
in a corpus. Instead, these featuresets are constructed lazily,
|
||||
as-needed. The reduction in memory overhead can be especially
|
||||
significant when the underlying list of tokens is itself lazy (as
|
||||
is the case with many corpus readers).
|
||||
|
||||
:param feature_func: The function that will be applied to each
|
||||
token. It should return a featureset -- i.e., a dict
|
||||
mapping feature names to feature values.
|
||||
:param toks: The list of tokens to which ``feature_func`` should be
|
||||
applied. If ``labeled=True``, then the list elements will be
|
||||
passed directly to ``feature_func()``. If ``labeled=False``,
|
||||
then the list elements should be tuples ``(tok,label)``, and
|
||||
``tok`` will be passed to ``feature_func()``.
|
||||
:param labeled: If true, then ``toks`` contains labeled tokens --
|
||||
i.e., tuples of the form ``(tok, label)``. (Default:
|
||||
auto-detect based on types.)
|
||||
"""
|
||||
if labeled is None:
|
||||
labeled = toks and isinstance(toks[0], (tuple, list))
|
||||
if labeled:
|
||||
|
||||
def lazy_func(labeled_token):
|
||||
return (feature_func(labeled_token[0]), labeled_token[1])
|
||||
|
||||
return LazyMap(lazy_func, toks)
|
||||
else:
|
||||
return LazyMap(feature_func, toks)
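# Hedged illustration (added; not part of the original module): feature
# extraction is deferred until a LazyMap element is actually accessed.
def _apply_features_example():
    toks = [("Alice", "female"), ("Bob", "male")]
    lazy = apply_features(names_demo_features, toks, labeled=True)
    # names_demo_features (defined below) runs here, on first access.
    featureset, label = lazy[0]
    return featureset["startswith"], label  # expected: ("a", "female")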
|
||||
|
||||
|
||||
def attested_labels(tokens):
|
||||
"""
|
||||
:return: A list of all labels that are attested in the given list
|
||||
of tokens.
|
||||
:rtype: list of (immutable)
|
||||
:param tokens: The list of classified tokens from which to extract
|
||||
labels. A classified token has the form ``(token, label)``.
|
||||
:type tokens: list
|
||||
"""
|
||||
return tuple({label for (tok, label) in tokens})
|
||||
|
||||
|
||||
def log_likelihood(classifier, gold):
|
||||
results = classifier.prob_classify_many([fs for (fs, l) in gold])
|
||||
ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
|
||||
return math.log(sum(ll) / len(ll))
|
||||
|
||||
|
||||
def accuracy(classifier, gold):
|
||||
results = classifier.classify_many([fs for (fs, l) in gold])
|
||||
correct = [l == r for ((fs, l), r) in zip(gold, results)]
|
||||
if correct:
|
||||
return sum(correct) / len(correct)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
class CutoffChecker:
|
||||
"""
|
||||
A helper class that implements cutoff checks based on number of
|
||||
iterations and log likelihood.
|
||||
|
||||
Accuracy cutoffs are also implemented, but they're almost never
|
||||
a good idea to use.
|
||||
"""
|
||||
|
||||
def __init__(self, cutoffs):
|
||||
self.cutoffs = cutoffs.copy()
|
||||
if "min_ll" in cutoffs:
|
||||
cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
|
||||
if "min_lldelta" in cutoffs:
|
||||
cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
|
||||
self.ll = None
|
||||
self.acc = None
|
||||
self.iter = 1
|
||||
|
||||
def check(self, classifier, train_toks):
|
||||
cutoffs = self.cutoffs
|
||||
self.iter += 1
|
||||
if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
|
||||
return True # iteration cutoff.
|
||||
|
||||
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
|
||||
if math.isnan(new_ll):
|
||||
return True
|
||||
|
||||
if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
|
||||
if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
|
||||
return True # log likelihood cutoff
|
||||
if (
|
||||
"min_lldelta" in cutoffs
|
||||
and self.ll
|
||||
and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
|
||||
):
|
||||
return True # log likelihood delta cutoff
|
||||
self.ll = new_ll
|
||||
|
||||
if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
|
||||
new_acc = nltk.classify.util.accuracy(classifier, train_toks)
|
||||
if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
|
||||
return True # accuracy cutoff
|
||||
if (
|
||||
"min_accdelta" in cutoffs
|
||||
and self.acc
|
||||
and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
|
||||
):
|
||||
return True # accuracy delta cutoff
|
||||
self.acc = new_acc
|
||||
|
||||
return False # no cutoff reached.
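# Illustration (added): a typical cutoffs dict combines an iteration limit
# with a log-likelihood delta, e.g.
#
#   checker = CutoffChecker({"max_iter": 100, "min_lldelta": 1e-4})
#
# check() is then called once per training iteration and returns True as
# soon as either cutoff is reached.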
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Demos
|
||||
######################################################################
|
||||
|
||||
|
||||
def names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith"] = name[0].lower()
|
||||
features["endswith"] = name[-1].lower()
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
return features
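# Illustration (added): names_demo_features("Gina") yields a dict containing
# {"alwayson": True, "startswith": "g", "endswith": "a", "count(a)": 1,
#  "has(a)": True, ...} with one count()/has() pair per letter of the alphabet.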
|
||||
|
||||
|
||||
def binary_names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith(vowel)"] = name[0].lower() in "aeiouy"
|
||||
features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
features["startswith(%s)" % letter] = letter == name[0].lower()
|
||||
features["endswith(%s)" % letter] = letter == name[-1].lower()
|
||||
return features
|
||||
|
||||
|
||||
def names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
# Construct a list of classified names, using the names corpus.
|
||||
namelist = [(name, "male") for name in names.words("male.txt")] + [
|
||||
(name, "female") for name in names.words("female.txt")
|
||||
]
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
random.seed(123456)
|
||||
random.shuffle(namelist)
|
||||
train = namelist[:5000]
|
||||
test = namelist[5000:5500]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(n), g) for (n, g) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, g) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, gender), pdist in list(zip(test, pdists))[:5]:
|
||||
if gender == "male":
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def partial_names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
male_names = names.words("male.txt")
|
||||
female_names = names.words("female.txt")
|
||||
|
||||
random.seed(654321)
|
||||
random.shuffle(male_names)
|
||||
random.shuffle(female_names)
|
||||
|
||||
# Create a list of male names to be used as positive-labeled examples for training
|
||||
positive = map(features, male_names[:2000])
|
||||
|
||||
# Create a list of male and female names to be used as unlabeled examples
|
||||
unlabeled = map(features, male_names[2000:2500] + female_names[:500])
|
||||
|
||||
# Create a test set with correctly-labeled male and female names
|
||||
test = [(name, True) for name in male_names[2500:2750]] + [
|
||||
(name, False) for name in female_names[500:750]
|
||||
]
|
||||
|
||||
random.shuffle(test)
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer(positive, unlabeled)
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, m) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, is_male), pdist in list(zip(test, pdists))[:5]:
|
||||
if is_male:
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
_inst_cache = {}
|
||||
|
||||
|
||||
def wsd_demo(trainer, word, features, n=1000):
|
||||
import random
|
||||
|
||||
from nltk.corpus import senseval
|
||||
|
||||
# Get the instances.
|
||||
print("Reading data...")
|
||||
global _inst_cache
|
||||
if word not in _inst_cache:
|
||||
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
|
||||
instances = _inst_cache[word][:]
|
||||
if n > len(instances):
|
||||
n = len(instances)
|
||||
senses = list({l for (i, l) in instances})
|
||||
print(" Senses: " + " ".join(senses))
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
print("Splitting into test & train...")
|
||||
random.seed(123456)
|
||||
random.shuffle(instances)
|
||||
train = instances[: int(0.8 * n)]
|
||||
test = instances[int(0.8 * n) : n]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(i), l) for (i, l) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(i) for (i, n) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def check_megam_config():
|
||||
"""
|
||||
Checks whether the MEGAM binary is configured.
|
||||
"""
|
||||
try:
|
||||
_megam_bin
|
||||
except NameError as e:
|
||||
err_msg = str(
|
||||
"Please configure your megam binary first, e.g.\n"
|
||||
">>> nltk.config_megam('/usr/bin/local/megam')"
|
||||
)
|
||||
raise NameError(err_msg) from e
|
||||
377
backend/venv/Lib/site-packages/nltk/classify/weka.py
Normal file
@@ -0,0 +1,377 @@
|
||||
# Natural Language Toolkit: Interface to Weka Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classifiers that make use of the external 'Weka' package.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import zipfile
|
||||
from sys import stdin
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.internals import config_java, java
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
_weka_classpath = None
|
||||
_weka_search = [
|
||||
".",
|
||||
"/usr/share/weka",
|
||||
"/usr/local/share/weka",
|
||||
"/usr/lib/weka",
|
||||
"/usr/local/lib/weka",
|
||||
]
|
||||
|
||||
|
||||
def config_weka(classpath=None):
|
||||
global _weka_classpath
|
||||
|
||||
# Make sure java's configured first.
|
||||
config_java()
|
||||
|
||||
if classpath is not None:
|
||||
_weka_classpath = classpath
|
||||
|
||||
if _weka_classpath is None:
|
||||
searchpath = _weka_search
|
||||
if "WEKAHOME" in os.environ:
|
||||
searchpath.insert(0, os.environ["WEKAHOME"])
|
||||
|
||||
for path in searchpath:
|
||||
if os.path.exists(os.path.join(path, "weka.jar")):
|
||||
_weka_classpath = os.path.join(path, "weka.jar")
|
||||
version = _check_weka_version(_weka_classpath)
|
||||
if version:
|
||||
print(f"[Found Weka: {_weka_classpath} (version {version})]")
|
||||
else:
|
||||
print("[Found Weka: %s]" % _weka_classpath)
|
||||
_check_weka_version(_weka_classpath)
|
||||
|
||||
if _weka_classpath is None:
|
||||
raise LookupError(
|
||||
"Unable to find weka.jar! Use config_weka() "
|
||||
"or set the WEKAHOME environment variable. "
|
||||
"For more information about Weka, please see "
|
||||
"https://www.cs.waikato.ac.nz/ml/weka/"
|
||||
)
|
||||
|
||||
|
||||
def _check_weka_version(jar):
|
||||
try:
|
||||
zf = zipfile.ZipFile(jar)
|
||||
except (SystemExit, KeyboardInterrupt):
|
||||
raise
|
||||
except:
|
||||
return None
|
||||
try:
|
||||
try:
|
||||
return zf.read("weka/core/version.txt")
|
||||
except KeyError:
|
||||
return None
|
||||
finally:
|
||||
zf.close()
|
||||
|
||||
|
||||
class WekaClassifier(ClassifierI):
|
||||
def __init__(self, formatter, model_filename):
|
||||
self._formatter = formatter
|
||||
self._model = model_filename
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
return self._classify_many(featuresets, ["-p", "0", "-distribution"])
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
return self._classify_many(featuresets, ["-p", "0"])
|
||||
|
||||
def _classify_many(self, featuresets, options):
|
||||
# Make sure we can find java & weka.
|
||||
config_weka()
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
# Write the test data file.
|
||||
test_filename = os.path.join(temp_dir, "test.arff")
|
||||
self._formatter.write(test_filename, featuresets)
|
||||
|
||||
# Call weka to classify the data.
|
||||
cmd = [
|
||||
"weka.classifiers.bayes.NaiveBayes",
|
||||
"-l",
|
||||
self._model,
|
||||
"-T",
|
||||
test_filename,
|
||||
] + options
|
||||
(stdout, stderr) = java(
|
||||
cmd,
|
||||
classpath=_weka_classpath,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
|
||||
# Check if something went wrong:
|
||||
if stderr and not stdout:
|
||||
if "Illegal options: -distribution" in stderr:
|
||||
raise ValueError(
|
||||
"The installed version of weka does "
|
||||
"not support probability distribution "
|
||||
"output."
|
||||
)
|
||||
else:
|
||||
raise ValueError("Weka failed to generate output:\n%s" % stderr)
|
||||
|
||||
# Parse weka's output.
|
||||
return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
os.rmdir(temp_dir)
|
||||
|
||||
def parse_weka_distribution(self, s):
|
||||
probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
|
||||
probs = dict(zip(self._formatter.labels(), probs))
|
||||
return DictionaryProbDist(probs)
|
||||
|
||||
def parse_weka_output(self, lines):
|
||||
# Strip unwanted text from stdout
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip().startswith("inst#"):
|
||||
lines = lines[i:]
|
||||
break
|
||||
|
||||
if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
|
||||
return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
|
||||
elif lines[0].split() == [
|
||||
"inst#",
|
||||
"actual",
|
||||
"predicted",
|
||||
"error",
|
||||
"distribution",
|
||||
]:
|
||||
return [
|
||||
self.parse_weka_distribution(line.split()[-1])
|
||||
for line in lines[1:]
|
||||
if line.strip()
|
||||
]
|
||||
|
||||
# is this safe?
|
||||
elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
|
||||
return [line.split()[1] for line in lines if line.strip()]
|
||||
|
||||
else:
|
||||
for line in lines[:10]:
|
||||
print(line)
|
||||
raise ValueError(
|
||||
"Unhandled output format -- your version "
|
||||
"of weka may not be supported.\n"
|
||||
" Header: %s" % lines[0]
|
||||
)
|
||||
|
||||
# [xx] full list of classifiers (some may be abstract?):
|
||||
# ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
|
||||
# DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
|
||||
# JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
|
||||
# LogisticBase, M5Base, MultilayerPerceptron,
|
||||
# MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
|
||||
# NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
|
||||
# PreConstructedLinearModel, Prism, RandomForest,
|
||||
# RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
|
||||
# RuleNode, SimpleLinearRegression, SimpleLogistic,
|
||||
# SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
|
||||
# VotedPerceptron, Winnow, ZeroR
|
||||
|
||||
_CLASSIFIER_CLASS = {
|
||||
"naivebayes": "weka.classifiers.bayes.NaiveBayes",
|
||||
"C4.5": "weka.classifiers.trees.J48",
|
||||
"log_regression": "weka.classifiers.functions.Logistic",
|
||||
"svm": "weka.classifiers.functions.SMO",
|
||||
"kstar": "weka.classifiers.lazy.KStar",
|
||||
"ripper": "weka.classifiers.rules.JRip",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def train(
|
||||
cls,
|
||||
model_filename,
|
||||
featuresets,
|
||||
classifier="naivebayes",
|
||||
options=[],
|
||||
quiet=True,
|
||||
):
|
||||
# Make sure we can find java & weka.
|
||||
config_weka()
|
||||
|
||||
# Build an ARFF formatter.
|
||||
formatter = ARFF_Formatter.from_train(featuresets)
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
# Write the training data file.
|
||||
train_filename = os.path.join(temp_dir, "train.arff")
|
||||
formatter.write(train_filename, featuresets)
|
||||
|
||||
if classifier in cls._CLASSIFIER_CLASS:
|
||||
javaclass = cls._CLASSIFIER_CLASS[classifier]
|
||||
elif classifier in cls._CLASSIFIER_CLASS.values():
|
||||
javaclass = classifier
|
||||
else:
|
||||
raise ValueError("Unknown classifier %s" % classifier)
|
||||
|
||||
# Train the weka model.
|
||||
cmd = [javaclass, "-d", model_filename, "-t", train_filename]
|
||||
cmd += list(options)
|
||||
if quiet:
|
||||
stdout = subprocess.PIPE
|
||||
else:
|
||||
stdout = None
|
||||
java(cmd, classpath=_weka_classpath, stdout=stdout)
|
||||
|
||||
# Return the new classifier.
|
||||
return WekaClassifier(formatter, model_filename)
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
os.rmdir(temp_dir)
|
||||
|
||||
|
||||
class ARFF_Formatter:
|
||||
"""
|
||||
Converts featuresets and labeled featuresets to ARFF-formatted
|
||||
strings, appropriate for input into Weka.
|
||||
|
||||
Features and classes can be specified manually in the constructor, or may
|
||||
be determined from data using ``from_train``.
|
||||
"""
|
||||
|
||||
def __init__(self, labels, features):
|
||||
"""
|
||||
:param labels: A list of all class labels that can be generated.
|
||||
:param features: A list of feature specifications, where
|
||||
each feature specification is a tuple (fname, ftype);
|
||||
and ftype is an ARFF type string such as NUMERIC or
|
||||
STRING.
|
||||
"""
|
||||
self._labels = labels
|
||||
self._features = features
|
||||
|
||||
def format(self, tokens):
|
||||
"""Returns a string representation of ARFF output for the given data."""
|
||||
return self.header_section() + self.data_section(tokens)
|
||||
|
||||
def labels(self):
|
||||
"""Returns the list of classes."""
|
||||
return list(self._labels)
|
||||
|
||||
def write(self, outfile, tokens):
|
||||
"""Writes ARFF data to a file for the given data."""
|
||||
if not hasattr(outfile, "write"):
|
||||
outfile = open(outfile, "w")
|
||||
outfile.write(self.format(tokens))
|
||||
outfile.close()
|
||||
|
||||
@staticmethod
|
||||
def from_train(tokens):
|
||||
"""
|
||||
Constructs an ARFF_Formatter instance with class labels and feature
|
||||
types determined from the given data. Handles boolean, numeric and
|
||||
string (note: not nominal) types.
|
||||
"""
|
||||
# Find the set of all attested labels.
|
||||
labels = {label for (tok, label) in tokens}
|
||||
|
||||
# Determine the types of all features.
|
||||
features = {}
|
||||
for tok, label in tokens:
|
||||
for fname, fval in tok.items():
|
||||
if issubclass(type(fval), bool):
|
||||
ftype = "{True, False}"
|
||||
elif issubclass(type(fval), (int, float, bool)):
|
||||
ftype = "NUMERIC"
|
||||
elif issubclass(type(fval), str):
|
||||
ftype = "STRING"
|
||||
elif fval is None:
|
||||
continue # can't tell the type.
|
||||
else:
|
||||
raise ValueError("Unsupported value type %r" % ftype)
|
||||
|
||||
if features.get(fname, ftype) != ftype:
|
||||
raise ValueError("Inconsistent type for %s" % fname)
|
||||
features[fname] = ftype
|
||||
features = sorted(features.items())
|
||||
|
||||
return ARFF_Formatter(labels, features)
|
||||
|
||||
def header_section(self):
|
||||
"""Returns an ARFF header as a string."""
|
||||
# Header comment.
|
||||
s = (
|
||||
"% Weka ARFF file\n"
|
||||
+ "% Generated automatically by NLTK\n"
|
||||
+ "%% %s\n\n" % time.ctime()
|
||||
)
|
||||
|
||||
# Relation name
|
||||
s += "@RELATION rel\n\n"
|
||||
|
||||
# Input attribute specifications
|
||||
for fname, ftype in self._features:
|
||||
s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
|
||||
|
||||
# Label attribute specification
|
||||
s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
|
||||
|
||||
return s
|
||||
|
||||
def data_section(self, tokens, labeled=None):
|
||||
"""
|
||||
Returns the ARFF data section for the given data.
|
||||
|
||||
:param tokens: a list of featuresets (dicts) or labelled featuresets
|
||||
which are tuples (featureset, label).
|
||||
:param labeled: Indicates whether the given tokens are labeled
|
||||
or not. If None, then the tokens will be assumed to be
|
||||
labeled if the first token's value is a tuple or list.
|
||||
"""
|
||||
# Check if the tokens are labeled or unlabeled. If unlabeled,
|
||||
# then use 'None'
|
||||
if labeled is None:
|
||||
labeled = tokens and isinstance(tokens[0], (tuple, list))
|
||||
if not labeled:
|
||||
tokens = [(tok, None) for tok in tokens]
|
||||
|
||||
# Data section
|
||||
s = "\n@DATA\n"
|
||||
for tok, label in tokens:
|
||||
for fname, ftype in self._features:
|
||||
s += "%s," % self._fmt_arff_val(tok.get(fname))
|
||||
s += "%s\n" % self._fmt_arff_val(label)
|
||||
|
||||
return s
|
||||
|
||||
def _fmt_arff_val(self, fval):
|
||||
if fval is None:
|
||||
return "?"
|
||||
elif isinstance(fval, (bool, int)):
|
||||
return "%s" % fval
|
||||
elif isinstance(fval, float):
|
||||
return "%r" % fval
|
||||
else:
|
||||
return "%r" % fval
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from nltk.classify.util import binary_names_demo_features, names_demo
|
||||
|
||||
def make_classifier(featuresets):
|
||||
return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
|
||||
|
||||
classifier = names_demo(make_classifier, binary_names_demo_features)
|
||||