Initial commit
backend/venv/Lib/site-packages/nltk/lm/__init__.py
@@ -0,0 +1,235 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Language Modeling Module.
------------------------------

Currently this module covers only ngram language models, but it should be easy
to extend to neural models.


Preparing Data
==============

Before we train our ngram models, it is necessary to make sure the data we put in
them is in the right format.
Let's say we have a text that is a list of sentences, where each sentence is
a list of strings. For simplicity we just consider a text consisting of
characters instead of words.

    >>> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

If we want to train a bigram model, we need to turn this text into bigrams.
Here's what the first sentence of our text would look like if we use a function
from NLTK for this.

    >>> from nltk.util import bigrams
    >>> list(bigrams(text[0]))
    [('a', 'b'), ('b', 'c')]

Notice how "b" occurs both as the first and second member of different bigrams
but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences
start with "a" and end with "c"?
A standard way to deal with this is to add special "padding" symbols to the
sentence before splitting it into ngrams.
Fortunately, NLTK also has a function for that; let's see what it does to the
first sentence.

    >>> from nltk.util import pad_sequence
    >>> list(pad_sequence(text[0],
    ... pad_left=True,
    ... left_pad_symbol="<s>",
    ... pad_right=True,
    ... right_pad_symbol="</s>",
    ... n=2))
    ['<s>', 'a', 'b', 'c', '</s>']

Note the `n` argument: it tells the function we need padding for bigrams.
Now, passing all these parameters every time is tedious and in most cases they
can safely be left at their default values anyway.
Thus our module provides a convenience function that has all these arguments
already set while the other arguments remain the same as for `pad_sequence`.

    >>> from nltk.lm.preprocessing import pad_both_ends
    >>> list(pad_both_ends(text[0], n=2))
    ['<s>', 'a', 'b', 'c', '</s>']

Combining the two parts discussed so far we get the following preparation steps
for one sentence.

    >>> list(bigrams(pad_both_ends(text[0], n=2)))
    [('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

To make our model more robust we could also train it on unigrams (single words)
as well as bigrams, its main source of information.
NLTK once again helpfully provides a function called `everygrams`.
While not the most efficient, it is conceptually simple.


    >>> from nltk.util import everygrams
    >>> padded_bigrams = list(pad_both_ends(text[0], n=2))
    >>> list(everygrams(padded_bigrams, max_len=2))
    [('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]

We are almost ready to start counting ngrams, just one more step left.
During training and evaluation our model will rely on a vocabulary that
defines which words are "known" to the model.
To create this vocabulary we need to pad our sentences (just like for counting
ngrams) and then combine the sentences into one flat stream of words.

    >>> from nltk.lm.preprocessing import flatten
    >>> list(flatten(pad_both_ends(sent, n=2) for sent in text))
    ['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In most cases we want to use the same text as the source for both vocabulary
and ngram counts.
Now that we understand what this means for our preprocessing, we can simply import
a function that does everything for us.

    >>> from nltk.lm.preprocessing import padded_everygram_pipeline
    >>> train, vocab = padded_everygram_pipeline(2, text)

So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy
iterators. They are evaluated on demand at training time.


Training
========
Having prepared our data we are ready to start training a model.
As a simple example, let us train a Maximum Likelihood Estimator (MLE).
We only need to specify the highest ngram order to instantiate it.

    >>> from nltk.lm import MLE
    >>> lm = MLE(2)

This automatically creates an empty vocabulary...

    >>> len(lm.vocab)
    0

... which gets filled as we fit the model.

    >>> lm.fit(train, vocab)
    >>> print(lm.vocab)
    <Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
    >>> len(lm.vocab)
    9

The vocabulary helps us handle words that have not occurred during training.

    >>> lm.vocab.lookup(text[0])
    ('a', 'b', 'c')
    >>> lm.vocab.lookup(["aliens", "from", "Mars"])
    ('<UNK>', '<UNK>', '<UNK>')

Moreover, in some cases we want to ignore words that we did see during training
but that didn't occur frequently enough to provide us useful information.
You can tell the vocabulary to ignore such words.
To find out how that works, check out the docs for the `Vocabulary` class.


Using a Trained Model
=====================
When it comes to ngram models the training boils down to counting up the ngrams
from the training corpus.

    >>> print(lm.counts)
    <NgramCounter with 2 ngram orders and 24 ngrams>

This provides a convenient interface to access counts for unigrams...

    >>> lm.counts['a']
    2

...and bigrams (in this case "a b")

    >>> lm.counts[['a']]['b']
    1

And so on. However, the real purpose of training a language model is to have it
score how probable words are in certain contexts.
This being MLE, the model returns the item's relative frequency as its score.

    >>> lm.score("a")
    0.15384615384615385

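As a quick sanity check (an editorial aside, not part of the original walkthrough),
this is just the relative frequency of "a" among the 13 unigram tokens of the
padded, flattened training text:

    >>> lm.counts['a'] / lm.counts.unigrams.N()
    0.15384615384615385
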
Items that are not seen during training are mapped to the vocabulary's
"unknown label" token. This is "<UNK>" by default.

    >>> lm.score("<UNK>") == lm.score("aliens")
    True

Here's how you get the score for a word given some preceding context.
For example, we want to know the chance that "b" is preceded by "a".

    >>> lm.score("b", ["a"])
    0.5

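Under the hood (again an editorial aside) this is the count of the bigram "a b"
divided by the total count of everything observed after the context "a":

    >>> lm.counts[['a']]['b'] / lm.counts[['a']].N()
    0.5
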
To avoid underflow when working with many small score values it makes sense to
take their logarithm.
For convenience this can be done with the `logscore` method.

    >>> lm.logscore("a")
    -2.700439718141092

Building on this method, we can also evaluate our model's cross-entropy and
perplexity with respect to sequences of ngrams.

    >>> test = [('a', 'b'), ('c', 'd')]
    >>> lm.entropy(test)
    1.292481250360578
    >>> lm.perplexity(test)
    2.449489742783178

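The two quantities are tied together (an editorial aside, using the module's
base-2 logarithm): entropy is the average negative log score of the test ngrams,
and perplexity is simply 2 raised to that entropy.

    >>> (lm.logscore("b", ["a"]) + lm.logscore("d", ["c"])) / 2
    -1.292481250360578
    >>> 2 ** lm.entropy(test) == lm.perplexity(test)
    True
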
It is advisable to preprocess your test text exactly the same way as you did
the training text.

One cool feature of ngram models is that they can be used to generate text.

    >>> lm.generate(1, random_seed=3)
    '<s>'
    >>> lm.generate(5, random_seed=3)
    ['<s>', 'a', 'b', 'c', 'd']

Provide `random_seed` if you want to consistently reproduce the same text, all
other things being equal. Here we are using it to test the examples.

You can also condition your generation on some preceding text with the `text_seed`
argument.

    >>> lm.generate(5, text_seed=['c'], random_seed=3)
    ['</s>', 'c', 'd', 'c', 'd']

Note that an ngram model is restricted in how much preceding context it can
take into account. For example, a trigram model can only condition its output
on 2 preceding words. If you pass in a 4-word context, the first two words
will be ignored.
"""

from nltk.lm.counter import NgramCounter
from nltk.lm.models import (
    MLE,
    AbsoluteDiscountingInterpolated,
    KneserNeyInterpolated,
    Laplace,
    Lidstone,
    StupidBackoff,
    WittenBellInterpolated,
)
from nltk.lm.vocabulary import Vocabulary

__all__ = [
    "Vocabulary",
    "NgramCounter",
    "MLE",
    "Lidstone",
    "Laplace",
    "WittenBellInterpolated",
    "KneserNeyInterpolated",
    "AbsoluteDiscountingInterpolated",
    "StupidBackoff",
]
backend/venv/Lib/site-packages/nltk/lm/api.py
@@ -0,0 +1,238 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Interface."""

import random
import warnings
from abc import ABCMeta, abstractmethod
from bisect import bisect
from itertools import accumulate

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary


class Smoothing(metaclass=ABCMeta):
    """Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    """

    def __init__(self, vocabulary, counter):
        """
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocabulary.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        """
        self.vocab = vocabulary
        self.counts = counter

    @abstractmethod
    def unigram_score(self, word):
        raise NotImplementedError()

    @abstractmethod
    def alpha_gamma(self, word, context):
        raise NotImplementedError()


def _mean(items):
    """Return average (aka mean) for sequence of items."""
    return sum(items) / len(items)


def _random_generator(seed_or_generator):
    if isinstance(seed_or_generator, random.Random):
        return seed_or_generator
    return random.Random(seed_or_generator)


def _weighted_choice(population, weights, random_generator=None):
    """Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    cum_weights = list(accumulate(weights))
    total = cum_weights[-1]
    threshold = random_generator.random()
    return population[bisect(cum_weights, total * threshold)]

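# Editorial sketch (not part of the original source): with weights (0.25, 0.75)
# the cumulative weights are [0.25, 1.0]; a draw seeded with 0 yields ~0.844,
# which falls past 0.25, so bisect picks index 1 and "b" is returned.
# >>> _weighted_choice(("a", "b"), (0.25, 0.75), random.Random(0))
# 'b'
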

class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param order: Highest ngram order of the model, e.g. 2 for a bigram model.
        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        """
        self.order = order
        if vocabulary and not isinstance(vocabulary, Vocabulary):
            warnings.warn(
                f"The `vocabulary` argument passed to {self.__class__.__name__!r} "
                "must be an instance of `nltk.lm.Vocabulary`.",
                stacklevel=3,
            )
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        :param vocabulary_text: Optional text used to build the vocabulary when
            the model was created without one.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
            If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        This implementation is based on the Shannon-McMillan-Breiman theorem,
        as used and referenced by Dan Jurafsky and Jordan Boyd-Graber.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # This is the base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
backend/venv/Lib/site-packages/nltk/lm/counter.py
@@ -0,0 +1,163 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Language Model Counter
----------------------
"""

from collections import defaultdict
from collections.abc import Sequence

from nltk.probability import ConditionalFreqDist, FreqDist


class NgramCounter:
    """Class for counting ngrams.

    Will count any ngram sequence you give it ;)

    First we need to make sure we are feeding the counter sentences of ngrams.

    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
    >>> from nltk.util import ngrams
    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
    >>> text_unigrams = [ngrams(sent, 1) for sent in text]

    The counting itself is very simple.

    >>> from nltk.lm import NgramCounter
    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)

    You can conveniently access ngram counts using standard python dictionary notation.
    String keys will give you unigram counts.

    >>> ngram_counts['a']
    2
    >>> ngram_counts['aliens']
    0

    If you want to access counts for higher order ngrams, use a list or a tuple.
    These are treated as "context" keys, so what you get is a frequency distribution
    over all continuations after the given context.

    >>> sorted(ngram_counts[['a']].items())
    [('b', 1), ('c', 1)]
    >>> sorted(ngram_counts[('a',)].items())
    [('b', 1), ('c', 1)]

    This is equivalent to specifying explicitly the order of the ngram (in this case
    2 for bigram) and indexing on the context.

    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
    True

    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
    It is generally advisable to use the less verbose and more flexible square
    bracket notation.

    To get the count of the full ngram "a b", do this:

    >>> ngram_counts[['a']]['b']
    1

    Specifying the ngram order as a number can be useful for accessing all ngrams
    in that order.

    >>> ngram_counts[2]
    <ConditionalFreqDist with 4 conditions>

    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
    Unigrams can also be accessed with a human-friendly alias.

    >>> ngram_counts.unigrams is ngram_counts[1]
    True

    Similarly to `collections.Counter`, you can update counts after initialization.

    >>> ngram_counts['e']
    0
    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
    >>> ngram_counts['e']
    1

    """

    def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        self._counts = defaultdict(ConditionalFreqDist)
        self._counts[1] = self.unigrams = FreqDist()

        if ngram_text:
            self.update(ngram_text)

    def update(self, ngram_text):
        """Updates ngram counts from `ngram_text`.

        Expects `ngram_text` to be a sequence of sentences (sequences).
        Each sentence consists of ngrams as tuples of strings.

        :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
        :raises TypeError: if the ngrams are not tuples.

        """

        for sent in ngram_text:
            for ngram in sent:
                if not isinstance(ngram, tuple):
                    raise TypeError(
                        "Ngram <{}> isn't a tuple, " "but {}".format(ngram, type(ngram))
                    )

                ngram_order = len(ngram)
                if ngram_order == 1:
                    self.unigrams[ngram[0]] += 1
                    continue

                context, word = ngram[:-1], ngram[-1]
                self[ngram_order][context][word] += 1

    def N(self):
        """Returns grand total number of ngrams stored.

        This includes ngrams from all orders, so some duplication is expected.
        :rtype: int

        >>> from nltk.lm import NgramCounter
        >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
        >>> counts.N()
        3

        """
        return sum(val.N() for val in self._counts.values())

    def __getitem__(self, item):
        """User-friendly access to ngram counts."""
        if isinstance(item, int):
            return self._counts[item]
        elif isinstance(item, str):
            return self._counts.__getitem__(1)[item]
        elif isinstance(item, Sequence):
            return self._counts.__getitem__(len(item) + 1)[tuple(item)]

    def __str__(self):
        return "<{} with {} ngram orders and {} ngrams>".format(
            self.__class__.__name__, len(self._counts), self.N()
        )

    def __len__(self):
        return self._counts.__len__()

    def __contains__(self, item):
        return item in self._counts
backend/venv/Lib/site-packages/nltk/lm/models.py
@@ -0,0 +1,141 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell


class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from LanguageModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        return self.context_counts(context).freq(word)


class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from LanguageModel also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Additive smoothing: Lidstone or Laplace.

        To see what kind, look at the `gamma` attribute on the class.

        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)

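    # Editorial sketch (not in the original source): with context counts
    # {"b": 1, "c": 1}, gamma = 0.5 and a vocabulary of 4 items, the score of
    # "b" after that context is (1 + 0.5) / (2 + 4 * 0.5) = 0.375.
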

class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to LanguageModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(1, *args, **kwargs)


class StupidBackoff(LanguageModel):
    """Provides StupidBackoff scores.

    In addition to initialization arguments from LanguageModel also requires
    a parameter alpha with which we scale the lower order probabilities.
    Note that this is not a true probability distribution as scores for ngrams
    of the same order do not sum up to unity.
    """

    def __init__(self, alpha=0.4, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha

    def unmasked_score(self, word, context=None):
        if not context:
            # Base recursion
            return self.counts.unigrams.freq(word)
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        if word_count > 0:
            return word_count / norm_count
        else:
            return self.alpha * self.unmasked_score(word, context[1:])

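    # Editorial sketch (not in the original source): if "b" was never observed
    # after the context ("a",), the score falls back to
    # alpha * unmasked_score("b"), i.e. 0.4 times the unigram frequency with the
    # default alpha; every additional backoff step multiplies by alpha again.
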

class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    Do not instantiate this class directly!
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            # The base recursion case: no context, we only have a unigram.
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # It can also happen that we have no data for this context.
            # In that case we defer to the lower-order ngram.
            # This is the same as setting alpha to 0 and gamma to 1.
            alpha, gamma = 0, 1
        else:
            alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])

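    # Editorial sketch (not in the original source): for a two-word context the
    # recursion unrolls to
    #   P(w | c1 c2) = alpha(w, c1 c2)
    #                  + gamma(c1 c2) * (alpha(w, c2) + gamma(c2) * P_unigram(w))
    # so every lower order contributes, weighted by the gammas above it.
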

class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super().__init__(WittenBell, order, **kwargs)


class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel):
    """Interpolated version of smoothing with absolute discount."""

    def __init__(self, order, discount=0.75, **kwargs):
        super().__init__(
            AbsoluteDiscounting, order, params={"discount": discount}, **kwargs
        )


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        if not (0 <= discount <= 1):
            raise ValueError(
                "Discount must be between 0 and 1 for probabilities to sum to unity."
            )
        super().__init__(
            KneserNey, order, params={"discount": discount, "order": order}, **kwargs
        )
backend/venv/Lib/site-packages/nltk/lm/preprocessing.py
@@ -0,0 +1,51 @@
# Natural Language Toolkit: Language Model Preprocessing
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import partial
from itertools import chain

from nltk.util import everygrams, pad_sequence

flatten = chain.from_iterable
pad_both_ends = partial(
    pad_sequence,
    pad_left=True,
    left_pad_symbol="<s>",
    pad_right=True,
    right_pad_symbol="</s>",
)
pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.

Following convention, <s> pads the start of a sentence and </s> pads its end.
"""


def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)


def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences.
    :type text: Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )
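
# Editorial usage sketch (not part of the original module): both returned values
# are lazy generators, so materialize them only when you need them and only once.
# >>> train, vocab = padded_everygram_pipeline(2, [["a", "b"]])
# >>> [list(sent) for sent in train]
# [[('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', '</s>'), ('</s>',)]]
# >>> list(vocab)
# ['<s>', 'a', 'b', '</s>']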
backend/venv/Lib/site-packages/nltk/lm/smoothing.py
@@ -0,0 +1,127 @@
# Natural Language Toolkit: Language Model Smoothing
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Smoothing algorithms for language modeling.

According to Chen & Goodman 1995 these should work with both Backoff and
Interpolation.
"""
from operator import methodcaller

from nltk.lm.api import Smoothing
from nltk.probability import ConditionalFreqDist


def _count_values_gt_zero(distribution):
    """Count values that are greater than zero in a distribution.

    Assumes distribution is either a mapping with counts as values or
    an instance of `nltk.ConditionalFreqDist`.
    """
    as_count = (
        methodcaller("N")
        if isinstance(distribution, ConditionalFreqDist)
        else lambda count: count
    )
    # We explicitly check that values are > 0 to guard against negative counts.
    return sum(
        1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0
    )


class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        alpha = self.counts[context].freq(word)
        gamma = self._gamma(context)
        return (1.0 - gamma) * alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return n_plus / (n_plus + self.counts[context].N())

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)

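    # Editorial sketch (not in the original source): gamma is the Witten-Bell mass
    # reserved for unseen continuations. If a context was followed by 3 distinct
    # words across 6 observations, gamma = 3 / (3 + 6) = 1/3, and the remaining
    # 2/3 scales the observed relative frequencies via alpha.
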

class AbsoluteDiscounting(Smoothing):
    """Smoothing with absolute discount."""

    def __init__(self, vocabulary, counter, discount=0.75, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def alpha_gamma(self, word, context):
        alpha = (
            max(self.counts[context][word] - self.discount, 0)
            / self.counts[context].N()
        )
        gamma = self._gamma(context)
        return alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return (self.discount * n_plus) / self.counts[context].N()

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class KneserNey(Smoothing):
    """Kneser-Ney Smoothing.

    This is an extension of smoothing with a discount.

    Resources:
    - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf
    - https://www.youtube.com/watch?v=ody1ysUTD7o
    - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
    - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf
    - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf
    """

    def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount
        self._order = order

    def unigram_score(self, word):
        word_continuation_count, total_count = self._continuation_counts(word)
        return word_continuation_count / total_count

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        word_continuation_count, total_count = (
            (prefix_counts[word], prefix_counts.N())
            if len(context) + 1 == self._order
            else self._continuation_counts(word, context)
        )
        alpha = max(word_continuation_count - self.discount, 0.0) / total_count
        gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count
        return alpha, gamma

    def _continuation_counts(self, word, context=tuple()):
        """Count continuations that end with context and word.

        Continuations track unique ngram "types", regardless of how many
        instances were observed for each "type".
        This is different than raw ngram counts which track number of instances.
        """
        higher_order_ngrams_with_context = (
            counts
            for prefix_ngram, counts in self.counts[len(context) + 2].items()
            if prefix_ngram[1:] == context
        )
        higher_order_ngrams_with_word_count, total = 0, 0
        for counts in higher_order_ngrams_with_context:
            higher_order_ngrams_with_word_count += int(counts[word] > 0)
            total += _count_values_gt_zero(counts)
        return higher_order_ngrams_with_word_count, total
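
# Editorial sketch (not in the original source): continuation counts are over ngram
# "types". Given the bigram types ("a", "b"), ("c", "b") and ("a", "a"), the word
# "b" continues 2 of the 3 types, so KneserNey.unigram_score("b") would be 2 / 3.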
backend/venv/Lib/site-packages/nltk/lm/util.py
@@ -0,0 +1,19 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Utilities"""

from math import log

NEG_INF = float("-inf")
POS_INF = float("inf")


def log_base2(score):
    """Convenience function for computing logarithms with base 2."""
    if score == 0.0:
        return NEG_INF
    return log(score, 2)
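
# Editorial sketch (not in the original source): the zero guard keeps unseen ngrams
# from raising ValueError and gives them a log score of negative infinity instead,
# e.g. log_base2(0.5) == -1.0 and log_base2(0.0) == float("-inf").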
backend/venv/Lib/site-packages/nltk/lm/vocabulary.py
@@ -0,0 +1,218 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Vocabulary"""

import sys
from collections import Counter
from collections.abc import Iterable
from functools import singledispatch
from itertools import chain


@singledispatch
def _dispatched_lookup(words, vocab):
    raise TypeError(f"Unsupported type for looking up in vocabulary: {type(words)}")


@_dispatched_lookup.register(Iterable)
def _(words, vocab):
    """Look up a sequence of words in the vocabulary.

    Returns a tuple of the looked up words.

    """
    return tuple(_dispatched_lookup(w, vocab) for w in words)


@_dispatched_lookup.register(str)
def _string_lookup(word, vocab):
    """Looks up one word in the vocabulary."""
    return word if word in vocab else vocab.unk_label


class Vocabulary:
    """Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:

    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return a tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    In general, the interface is the same as that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    """

    def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):
        """Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
            pre-seed the Vocabulary. In case it is iterable, counts
            are calculated.
        :param int unk_cutoff: Words that occur less frequently than this value
            are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.

        """
        self.unk_label = unk_label
        if unk_cutoff < 1:
            raise ValueError(f"Cutoff value cannot be less than 1. Got: {unk_cutoff}")
        self._cutoff = unk_cutoff

        self.counts = Counter()
        self.update(counts if counts is not None else "")

    @property
    def cutoff(self):
        """Cutoff value.

        Items with count below this value are not considered part of vocabulary.

        """
        return self._cutoff

    def update(self, *counter_args, **counter_kwargs):
        """Update vocabulary counts.

        Wraps `collections.Counter.update` method.

        """
        self.counts.update(*counter_args, **counter_kwargs)
        self._len = sum(1 for _ in self)

    def lookup(self, words):
        """Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return a tuple of the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: tuple(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        """
        return _dispatched_lookup(words, self)

    def __getitem__(self, item):
        return self._cutoff if item == self.unk_label else self.counts[item]

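    # Editorial note (not in the original source): returning the cutoff for the
    # unknown label guarantees that `self.unk_label in vocab` is always True,
    # because membership compares this value against the cutoff below.
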
    def __contains__(self, item):
        """Only consider items with counts greater than or equal to the cutoff
        as being in the vocabulary."""
        return self[item] >= self.cutoff

    def __iter__(self):
        """Building on the membership check, define how to iterate over the
        vocabulary."""
        return chain(
            (item for item in self.counts if item in self),
            [self.unk_label] if self.counts else [],
        )

    def __len__(self):
        """Computing size of vocabulary reflects the cutoff."""
        return self._len

    def __eq__(self, other):
        return (
            self.unk_label == other.unk_label
            and self.cutoff == other.cutoff
            and self.counts == other.counts
        )

    def __str__(self):
        return "<{} with cutoff={} unk_label='{}' and {} items>".format(
            self.__class__.__name__, self.cutoff, self.unk_label, len(self)
        )