Initial commit
backend/venv/Lib/site-packages/nltk/lm/__init__.py
@@ -0,0 +1,235 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Language Modeling Module.
------------------------------

Currently this module covers only ngram language models, but it should be easy
to extend to neural models.


Preparing Data
==============

Before we train our ngram models, it is necessary to make sure the data we put in
them is in the right format.
Let's say we have a text that is a list of sentences, where each sentence is
a list of strings. For simplicity we just consider a text consisting of
characters instead of words.

    >>> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

If we want to train a bigram model, we need to turn this text into bigrams.
Here's what the first sentence of our text would look like if we use a function
from NLTK for this.

    >>> from nltk.util import bigrams
    >>> list(bigrams(text[0]))
    [('a', 'b'), ('b', 'c')]

Notice how "b" occurs both as the first and second member of different bigrams
but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences
start with "a" and end with "c"?
A standard way to deal with this is to add special "padding" symbols to the
sentence before splitting it into ngrams.
Fortunately, NLTK also has a function for that; let's see what it does to the
first sentence.

    >>> from nltk.util import pad_sequence
    >>> list(pad_sequence(text[0],
    ... pad_left=True,
    ... left_pad_symbol="<s>",
    ... pad_right=True,
    ... right_pad_symbol="</s>",
    ... n=2))
    ['<s>', 'a', 'b', 'c', '</s>']

Note the `n` argument: it tells the function we need padding for bigrams.
Now, passing all these parameters every time is tedious and in most cases they
can safely be left at their default values anyway.
Thus our module provides a convenience function that has all these arguments
already set while the other arguments remain the same as for `pad_sequence`.

    >>> from nltk.lm.preprocessing import pad_both_ends
    >>> list(pad_both_ends(text[0], n=2))
    ['<s>', 'a', 'b', 'c', '</s>']

Combining the two parts discussed so far we get the following preparation steps
for one sentence.

    >>> list(bigrams(pad_both_ends(text[0], n=2)))
    [('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

To make our model more robust we could also train it on unigrams (single words)
as well as bigrams, its main source of information.
NLTK once again helpfully provides a function called `everygrams`.
While not the most efficient, it is conceptually simple.


    >>> from nltk.util import everygrams
    >>> padded_bigrams = list(pad_both_ends(text[0], n=2))
    >>> list(everygrams(padded_bigrams, max_len=2))
    [('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]

We are almost ready to start counting ngrams, just one more step left.
During training and evaluation our model will rely on a vocabulary that
defines which words are "known" to the model.
To create this vocabulary we need to pad our sentences (just like for counting
ngrams) and then combine the sentences into one flat stream of words.

    >>> from nltk.lm.preprocessing import flatten
    >>> list(flatten(pad_both_ends(sent, n=2) for sent in text))
    ['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In most cases we want to use the same text as the source for both vocabulary
and ngram counts.
Now that we understand what this means for our preprocessing, we can simply import
a function that does everything for us.

    >>> from nltk.lm.preprocessing import padded_everygram_pipeline
    >>> train, vocab = padded_everygram_pipeline(2, text)

So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy
iterators. They are evaluated on demand at training time.


Training
========
Having prepared our data we are ready to start training a model.
As a simple example, let us train a Maximum Likelihood Estimator (MLE).
We only need to specify the highest ngram order to instantiate it.

    >>> from nltk.lm import MLE
    >>> lm = MLE(2)

This automatically creates an empty vocabulary...

    >>> len(lm.vocab)
    0

... which gets filled as we fit the model.

    >>> lm.fit(train, vocab)
    >>> print(lm.vocab)
    <Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
    >>> len(lm.vocab)
    9

The vocabulary helps us handle words that have not occurred during training.

    >>> lm.vocab.lookup(text[0])
    ('a', 'b', 'c')
    >>> lm.vocab.lookup(["aliens", "from", "Mars"])
    ('<UNK>', '<UNK>', '<UNK>')

Moreover, in some cases we want to ignore words that we did see during training
but that didn't occur frequently enough to provide us useful information.
You can tell the vocabulary to ignore such words.
To find out how that works, check out the docs for the `Vocabulary` class.


Using a Trained Model
=====================
When it comes to ngram models the training boils down to counting up the ngrams
from the training corpus.

    >>> print(lm.counts)
    <NgramCounter with 2 ngram orders and 24 ngrams>

This provides a convenient interface to access counts for unigrams...

    >>> lm.counts['a']
    2

...and bigrams (in this case "a b")

    >>> lm.counts[['a']]['b']
    1

And so on. However, the real purpose of training a language model is to have it
score how probable words are in certain contexts.
This being MLE, the model returns the item's relative frequency as its score.

    >>> lm.score("a")
    0.15384615384615385

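As a quick sanity check (an editorial aside, not part of the original walkthrough),
this is just the relative frequency of "a" among the 13 unigram tokens of the
padded, flattened training text:

    >>> lm.counts['a'] / lm.counts.unigrams.N()
    0.15384615384615385
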
Items that are not seen during training are mapped to the vocabulary's
"unknown label" token. This is "<UNK>" by default.

    >>> lm.score("<UNK>") == lm.score("aliens")
    True

Here's how you get the score for a word given some preceding context.
For example, we want to know the chance that "b" is preceded by "a".

    >>> lm.score("b", ["a"])
    0.5

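Under the hood (again an editorial aside) this is the count of the bigram "a b"
divided by the total count of everything observed after the context "a":

    >>> lm.counts[['a']]['b'] / lm.counts[['a']].N()
    0.5
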
To avoid underflow when working with many small score values it makes sense to
take their logarithm.
For convenience this can be done with the `logscore` method.

    >>> lm.logscore("a")
    -2.700439718141092

Building on this method, we can also evaluate our model's cross-entropy and
perplexity with respect to sequences of ngrams.

    >>> test = [('a', 'b'), ('c', 'd')]
    >>> lm.entropy(test)
    1.292481250360578
    >>> lm.perplexity(test)
    2.449489742783178

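The two quantities are tied together (an editorial aside, using the module's
base-2 logarithm): entropy is the average negative log score of the test ngrams,
and perplexity is simply 2 raised to that entropy.

    >>> (lm.logscore("b", ["a"]) + lm.logscore("d", ["c"])) / 2
    -1.292481250360578
    >>> 2 ** lm.entropy(test) == lm.perplexity(test)
    True
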
It is advisable to preprocess your test text exactly the same way as you did
the training text.

One cool feature of ngram models is that they can be used to generate text.

    >>> lm.generate(1, random_seed=3)
    '<s>'
    >>> lm.generate(5, random_seed=3)
    ['<s>', 'a', 'b', 'c', 'd']

Provide `random_seed` if you want to consistently reproduce the same text, all
other things being equal. Here we are using it to test the examples.

You can also condition your generation on some preceding text with the `text_seed`
argument.

    >>> lm.generate(5, text_seed=['c'], random_seed=3)
    ['</s>', 'c', 'd', 'c', 'd']

Note that an ngram model is restricted in how much preceding context it can
take into account. For example, a trigram model can only condition its output
on 2 preceding words. If you pass in a 4-word context, the first two words
will be ignored.
"""

from nltk.lm.counter import NgramCounter
from nltk.lm.models import (
    MLE,
    AbsoluteDiscountingInterpolated,
    KneserNeyInterpolated,
    Laplace,
    Lidstone,
    StupidBackoff,
    WittenBellInterpolated,
)
from nltk.lm.vocabulary import Vocabulary

__all__ = [
    "Vocabulary",
    "NgramCounter",
    "MLE",
    "Lidstone",
    "Laplace",
    "WittenBellInterpolated",
    "KneserNeyInterpolated",
    "AbsoluteDiscountingInterpolated",
    "StupidBackoff",
]
backend/venv/Lib/site-packages/nltk/lm/api.py
@@ -0,0 +1,238 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Interface."""

import random
import warnings
from abc import ABCMeta, abstractmethod
from bisect import bisect
from itertools import accumulate

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary


class Smoothing(metaclass=ABCMeta):
    """Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    """

    def __init__(self, vocabulary, counter):
        """
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocabulary.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        """
        self.vocab = vocabulary
        self.counts = counter

    @abstractmethod
    def unigram_score(self, word):
        raise NotImplementedError()

    @abstractmethod
    def alpha_gamma(self, word, context):
        raise NotImplementedError()


def _mean(items):
    """Return average (aka mean) for sequence of items."""
    return sum(items) / len(items)


def _random_generator(seed_or_generator):
    if isinstance(seed_or_generator, random.Random):
        return seed_or_generator
    return random.Random(seed_or_generator)


def _weighted_choice(population, weights, random_generator=None):
    """Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    cum_weights = list(accumulate(weights))
    total = cum_weights[-1]
    threshold = random_generator.random()
    return population[bisect(cum_weights, total * threshold)]

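# Editorial sketch (not part of the original source): with weights (0.25, 0.75)
# the cumulative weights are [0.25, 1.0]; a draw seeded with 0 yields ~0.844,
# which falls past 0.25, so bisect picks index 1 and "b" is returned.
# >>> _weighted_choice(("a", "b"), (0.25, 0.75), random.Random(0))
# 'b'
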

class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param order: Highest ngram order of the model, e.g. 2 for a bigram model.
        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        """
        self.order = order
        if vocabulary and not isinstance(vocabulary, Vocabulary):
            warnings.warn(
                f"The `vocabulary` argument passed to {self.__class__.__name__!r} "
                "must be an instance of `nltk.lm.Vocabulary`.",
                stacklevel=3,
            )
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        :param vocabulary_text: Optional text used to build the vocabulary when
            the model was created without one.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
            If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        This implementation is based on the Shannon-McMillan-Breiman theorem,
        as used and referenced by Dan Jurafsky and Jordan Boyd-Graber.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # This is the base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
backend/venv/Lib/site-packages/nltk/lm/counter.py
@@ -0,0 +1,163 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Language Model Counter
----------------------
"""

from collections import defaultdict
from collections.abc import Sequence

from nltk.probability import ConditionalFreqDist, FreqDist


class NgramCounter:
    """Class for counting ngrams.

    Will count any ngram sequence you give it ;)

    First we need to make sure we are feeding the counter sentences of ngrams.

    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
    >>> from nltk.util import ngrams
    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
    >>> text_unigrams = [ngrams(sent, 1) for sent in text]

    The counting itself is very simple.

    >>> from nltk.lm import NgramCounter
    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)

    You can conveniently access ngram counts using standard python dictionary notation.
    String keys will give you unigram counts.

    >>> ngram_counts['a']
    2
    >>> ngram_counts['aliens']
    0

    If you want to access counts for higher order ngrams, use a list or a tuple.
    These are treated as "context" keys, so what you get is a frequency distribution
    over all continuations after the given context.

    >>> sorted(ngram_counts[['a']].items())
    [('b', 1), ('c', 1)]
    >>> sorted(ngram_counts[('a',)].items())
    [('b', 1), ('c', 1)]

    This is equivalent to specifying explicitly the order of the ngram (in this case
    2 for bigram) and indexing on the context.

    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
    True

    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
    It is generally advisable to use the less verbose and more flexible square
    bracket notation.

    To get the count of the full ngram "a b", do this:

    >>> ngram_counts[['a']]['b']
    1

    Specifying the ngram order as a number can be useful for accessing all ngrams
    in that order.

    >>> ngram_counts[2]
    <ConditionalFreqDist with 4 conditions>

    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
    Unigrams can also be accessed with a human-friendly alias.

    >>> ngram_counts.unigrams is ngram_counts[1]
    True

    Similarly to `collections.Counter`, you can update counts after initialization.

    >>> ngram_counts['e']
    0
    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
    >>> ngram_counts['e']
    1

    """

    def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        self._counts = defaultdict(ConditionalFreqDist)
        self._counts[1] = self.unigrams = FreqDist()

        if ngram_text:
            self.update(ngram_text)

    def update(self, ngram_text):
        """Updates ngram counts from `ngram_text`.

        Expects `ngram_text` to be a sequence of sentences (sequences).
        Each sentence consists of ngrams as tuples of strings.

        :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
        :raises TypeError: if the ngrams are not tuples.

        """

        for sent in ngram_text:
            for ngram in sent:
                if not isinstance(ngram, tuple):
                    raise TypeError(
                        "Ngram <{}> isn't a tuple, " "but {}".format(ngram, type(ngram))
                    )

                ngram_order = len(ngram)
                if ngram_order == 1:
                    self.unigrams[ngram[0]] += 1
                    continue

                context, word = ngram[:-1], ngram[-1]
                self[ngram_order][context][word] += 1

    def N(self):
        """Returns grand total number of ngrams stored.

        This includes ngrams from all orders, so some duplication is expected.
        :rtype: int

        >>> from nltk.lm import NgramCounter
        >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
        >>> counts.N()
        3

        """
        return sum(val.N() for val in self._counts.values())

    def __getitem__(self, item):
        """User-friendly access to ngram counts."""
        if isinstance(item, int):
            return self._counts[item]
        elif isinstance(item, str):
            return self._counts.__getitem__(1)[item]
        elif isinstance(item, Sequence):
            return self._counts.__getitem__(len(item) + 1)[tuple(item)]

    def __str__(self):
        return "<{} with {} ngram orders and {} ngrams>".format(
            self.__class__.__name__, len(self._counts), self.N()
        )

    def __len__(self):
        return self._counts.__len__()

    def __contains__(self, item):
        return item in self._counts
backend/venv/Lib/site-packages/nltk/lm/models.py
@@ -0,0 +1,141 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell


class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from LanguageModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        return self.context_counts(context).freq(word)


class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from LanguageModel also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Additive smoothing: Lidstone or Laplace.

        To see what kind, look at the `gamma` attribute on the class.

        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)

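    # Editorial sketch (not in the original source): with context counts
    # {"b": 1, "c": 1}, gamma = 0.5 and a vocabulary of 4 items, the score of
    # "b" after that context is (1 + 0.5) / (2 + 4 * 0.5) = 0.375.
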

class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to LanguageModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(1, *args, **kwargs)


class StupidBackoff(LanguageModel):
    """Provides StupidBackoff scores.

    In addition to initialization arguments from LanguageModel also requires
    a parameter alpha with which we scale the lower order probabilities.
    Note that this is not a true probability distribution as scores for ngrams
    of the same order do not sum up to unity.
    """

    def __init__(self, alpha=0.4, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha

    def unmasked_score(self, word, context=None):
        if not context:
            # Base recursion
            return self.counts.unigrams.freq(word)
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        if word_count > 0:
            return word_count / norm_count
        else:
            return self.alpha * self.unmasked_score(word, context[1:])

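    # Editorial sketch (not in the original source): if "b" was never observed
    # after the context ("a",), the score falls back to
    # alpha * unmasked_score("b"), i.e. 0.4 times the unigram frequency with the
    # default alpha; every additional backoff step multiplies by alpha again.
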

class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    Do not instantiate this class directly!
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            # The base recursion case: no context, we only have a unigram.
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # It can also happen that we have no data for this context.
            # In that case we defer to the lower-order ngram.
            # This is the same as setting alpha to 0 and gamma to 1.
            alpha, gamma = 0, 1
        else:
            alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])

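    # Editorial sketch (not in the original source): for a two-word context the
    # recursion unrolls to
    #   P(w | c1 c2) = alpha(w, c1 c2)
    #                  + gamma(c1 c2) * (alpha(w, c2) + gamma(c2) * P_unigram(w))
    # so every lower order contributes, weighted by the gammas above it.
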

class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super().__init__(WittenBell, order, **kwargs)


class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel):
    """Interpolated version of smoothing with absolute discount."""

    def __init__(self, order, discount=0.75, **kwargs):
        super().__init__(
            AbsoluteDiscounting, order, params={"discount": discount}, **kwargs
        )


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        if not (0 <= discount <= 1):
            raise ValueError(
                "Discount must be between 0 and 1 for probabilities to sum to unity."
            )
        super().__init__(
            KneserNey, order, params={"discount": discount, "order": order}, **kwargs
        )
backend/venv/Lib/site-packages/nltk/lm/preprocessing.py
@@ -0,0 +1,51 @@
# Natural Language Toolkit: Language Model Preprocessing
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import partial
from itertools import chain

from nltk.util import everygrams, pad_sequence

flatten = chain.from_iterable
pad_both_ends = partial(
    pad_sequence,
    pad_left=True,
    left_pad_symbol="<s>",
    pad_right=True,
    right_pad_symbol="</s>",
)
pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.

Following convention, <s> pads the start of a sentence and </s> pads its end.
"""


def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)


def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences.
    :type text: Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )
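
# Editorial usage sketch (not part of the original module): both returned values
# are lazy generators, so materialize them only when you need them and only once.
# >>> train, vocab = padded_everygram_pipeline(2, [["a", "b"]])
# >>> [list(sent) for sent in train]
# [[('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', '</s>'), ('</s>',)]]
# >>> list(vocab)
# ['<s>', 'a', 'b', '</s>']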
backend/venv/Lib/site-packages/nltk/lm/smoothing.py
@@ -0,0 +1,127 @@
# Natural Language Toolkit: Language Model Smoothing
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Smoothing algorithms for language modeling.

According to Chen & Goodman 1995 these should work with both Backoff and
Interpolation.
"""
from operator import methodcaller

from nltk.lm.api import Smoothing
from nltk.probability import ConditionalFreqDist


def _count_values_gt_zero(distribution):
    """Count values that are greater than zero in a distribution.

    Assumes distribution is either a mapping with counts as values or
    an instance of `nltk.ConditionalFreqDist`.
    """
    as_count = (
        methodcaller("N")
        if isinstance(distribution, ConditionalFreqDist)
        else lambda count: count
    )
    # We explicitly check that values are > 0 to guard against negative counts.
    return sum(
        1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0
    )


class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        alpha = self.counts[context].freq(word)
        gamma = self._gamma(context)
        return (1.0 - gamma) * alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return n_plus / (n_plus + self.counts[context].N())

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)

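    # Editorial sketch (not in the original source): gamma is the Witten-Bell mass
    # reserved for unseen continuations. If a context was followed by 3 distinct
    # words across 6 observations, gamma = 3 / (3 + 6) = 1/3, and the remaining
    # 2/3 scales the observed relative frequencies via alpha.
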

class AbsoluteDiscounting(Smoothing):
    """Smoothing with absolute discount."""

    def __init__(self, vocabulary, counter, discount=0.75, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def alpha_gamma(self, word, context):
        alpha = (
            max(self.counts[context][word] - self.discount, 0)
            / self.counts[context].N()
        )
        gamma = self._gamma(context)
        return alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return (self.discount * n_plus) / self.counts[context].N()

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class KneserNey(Smoothing):
    """Kneser-Ney Smoothing.

    This is an extension of smoothing with a discount.

    Resources:
    - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf
    - https://www.youtube.com/watch?v=ody1ysUTD7o
    - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
    - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf
    - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf
    """

    def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount
        self._order = order

    def unigram_score(self, word):
        word_continuation_count, total_count = self._continuation_counts(word)
        return word_continuation_count / total_count

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        word_continuation_count, total_count = (
            (prefix_counts[word], prefix_counts.N())
            if len(context) + 1 == self._order
            else self._continuation_counts(word, context)
        )
        alpha = max(word_continuation_count - self.discount, 0.0) / total_count
        gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count
        return alpha, gamma

    def _continuation_counts(self, word, context=tuple()):
        """Count continuations that end with context and word.

        Continuations track unique ngram "types", regardless of how many
        instances were observed for each "type".
        This is different than raw ngram counts which track number of instances.
        """
        higher_order_ngrams_with_context = (
            counts
            for prefix_ngram, counts in self.counts[len(context) + 2].items()
            if prefix_ngram[1:] == context
        )
        higher_order_ngrams_with_word_count, total = 0, 0
        for counts in higher_order_ngrams_with_context:
            higher_order_ngrams_with_word_count += int(counts[word] > 0)
            total += _count_values_gt_zero(counts)
        return higher_order_ngrams_with_word_count, total
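
# Editorial sketch (not in the original source): continuation counts are over ngram
# "types". Given the bigram types ("a", "b"), ("c", "b") and ("a", "a"), the word
# "b" continues 2 of the 3 types, so KneserNey.unigram_score("b") would be 2 / 3.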
backend/venv/Lib/site-packages/nltk/lm/util.py
@@ -0,0 +1,19 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Utilities"""

from math import log

NEG_INF = float("-inf")
POS_INF = float("inf")


def log_base2(score):
    """Convenience function for computing logarithms with base 2."""
    if score == 0.0:
        return NEG_INF
    return log(score, 2)
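
# Editorial sketch (not in the original source): the zero guard keeps unseen ngrams
# from raising ValueError and gives them a log score of negative infinity instead,
# e.g. log_base2(0.5) == -1.0 and log_base2(0.0) == float("-inf").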
backend/venv/Lib/site-packages/nltk/lm/vocabulary.py
@@ -0,0 +1,218 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Vocabulary"""

import sys
from collections import Counter
from collections.abc import Iterable
from functools import singledispatch
from itertools import chain


@singledispatch
def _dispatched_lookup(words, vocab):
    raise TypeError(f"Unsupported type for looking up in vocabulary: {type(words)}")


@_dispatched_lookup.register(Iterable)
def _(words, vocab):
    """Look up a sequence of words in the vocabulary.

    Returns a tuple of the looked up words.

    """
    return tuple(_dispatched_lookup(w, vocab) for w in words)


@_dispatched_lookup.register(str)
def _string_lookup(word, vocab):
    """Looks up one word in the vocabulary."""
    return word if word in vocab else vocab.unk_label


class Vocabulary:
    """Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:

    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return a tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    In general, the interface is the same as that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    """

    def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):
        """Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
            pre-seed the Vocabulary. In case it is iterable, counts
            are calculated.
        :param int unk_cutoff: Words that occur less frequently than this value
            are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.

        """
        self.unk_label = unk_label
        if unk_cutoff < 1:
            raise ValueError(f"Cutoff value cannot be less than 1. Got: {unk_cutoff}")
        self._cutoff = unk_cutoff

        self.counts = Counter()
        self.update(counts if counts is not None else "")

    @property
    def cutoff(self):
        """Cutoff value.

        Items with count below this value are not considered part of vocabulary.

        """
        return self._cutoff

    def update(self, *counter_args, **counter_kwargs):
        """Update vocabulary counts.

        Wraps `collections.Counter.update` method.

        """
        self.counts.update(*counter_args, **counter_kwargs)
        self._len = sum(1 for _ in self)

    def lookup(self, words):
        """Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return a tuple of the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: tuple(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        """
        return _dispatched_lookup(words, self)

    def __getitem__(self, item):
        return self._cutoff if item == self.unk_label else self.counts[item]

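    # Editorial note (not in the original source): returning the cutoff for the
    # unknown label guarantees that `self.unk_label in vocab` is always True,
    # because membership compares this value against the cutoff below.
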
    def __contains__(self, item):
        """Only consider items with counts greater than or equal to the cutoff
        as being in the vocabulary."""
        return self[item] >= self.cutoff

    def __iter__(self):
        """Building on the membership check, define how to iterate over the
        vocabulary."""
        return chain(
            (item for item in self.counts if item in self),
            [self.unk_label] if self.counts else [],
        )

    def __len__(self):
        """Computing size of vocabulary reflects the cutoff."""
        return self._len

    def __eq__(self, other):
        return (
            self.unk_label == other.unk_label
            and self.cutoff == other.cutoff
            and self.counts == other.counts
        )

    def __str__(self):
        return "<{} with cutoff={} unk_label='{}' and {} items>".format(
            self.__class__.__name__, self.cutoff, self.unk_label, len(self)
        )