Initial commit
This commit is contained in:
33
backend/venv/Lib/site-packages/nltk/translate/__init__.py
Normal file
33
backend/venv/Lib/site-packages/nltk/translate/__init__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# Natural Language Toolkit: Machine Translation
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Experimental features for machine translation.
|
||||
These interfaces are prone to change.
|
||||
|
||||
isort:skip_file
|
||||
"""
|
||||
|
||||
from nltk.translate.api import AlignedSent, Alignment, PhraseTable
|
||||
from nltk.translate.ibm_model import IBMModel
|
||||
from nltk.translate.ibm1 import IBMModel1
|
||||
from nltk.translate.ibm2 import IBMModel2
|
||||
from nltk.translate.ibm3 import IBMModel3
|
||||
from nltk.translate.ibm4 import IBMModel4
|
||||
from nltk.translate.ibm5 import IBMModel5
|
||||
from nltk.translate.bleu_score import sentence_bleu as bleu
|
||||
from nltk.translate.ribes_score import sentence_ribes as ribes
|
||||
from nltk.translate.meteor_score import meteor_score as meteor
|
||||
from nltk.translate.metrics import alignment_error_rate
|
||||
from nltk.translate.stack_decoder import StackDecoder
|
||||
from nltk.translate.nist_score import sentence_nist as nist
|
||||
from nltk.translate.chrf_score import sentence_chrf as chrf
|
||||
from nltk.translate.gale_church import trace
|
||||
from nltk.translate.gdfa import grow_diag_final_and
|
||||
from nltk.translate.gleu_score import sentence_gleu as gleu
|
||||
from nltk.translate.phrase_based import extract
|
||||
from nltk.translate.lepor import sentence_lepor as lepor, corpus_lepor
|
||||
335
backend/venv/Lib/site-packages/nltk/translate/api.py
Normal file
335
backend/venv/Lib/site-packages/nltk/translate/api.py
Normal file
@@ -0,0 +1,335 @@
|
||||
# Natural Language Toolkit: API for alignment and translation objects
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Will Zhang <wilzzha@gmail.com>
|
||||
# Guan Gui <ggui@student.unimelb.edu.au>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import subprocess
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
class AlignedSent:
    """
    A pair of token sequences bundled with the ``Alignment`` that links them.

    The usual use is in machine translation, where one sequence is a
    sentence and the other is its translation.

        >>> from nltk.translate import AlignedSent, Alignment
        >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
        ...     ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
        >>> algnsent.words
        ['klein', 'ist', 'das', 'Haus']
        >>> algnsent.mots
        ['the', 'house', 'is', 'small']
        >>> algnsent.alignment
        Alignment([(0, 3), (1, 2), (2, 0), (3, 1)])
        >>> from nltk.corpus import comtrans
        >>> print(comtrans.aligned_sents()[54])
        <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
        >>> print(comtrans.aligned_sents()[54].alignment)
        0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13

    :param words: Tokens of the first sentence; the first element of every
        alignment pair indexes into this sequence.
    :type words: list(str)
    :param mots: Tokens of the second sentence; the second element of every
        alignment pair indexes into this sequence.
    :type mots: list(str)
    :param alignment: Word-level links between ``words`` and ``mots``, each
        a 2-tuple (words_index, mots_index). Defaults to an empty alignment.
    :type alignment: Alignment
    """

    def __init__(self, words, mots, alignment=None):
        self._words = words
        self._mots = mots
        # Assigning through the ``alignment`` property validates the indices.
        if alignment is not None:
            assert type(alignment) is Alignment
            self.alignment = alignment
        else:
            self.alignment = Alignment([])

    @property
    def words(self):
        """The first token sequence, as passed to the constructor."""
        return self._words

    @property
    def mots(self):
        """The second token sequence, as passed to the constructor."""
        return self._mots

    def _get_alignment(self):
        return self._alignment

    def _set_alignment(self, alignment):
        # Reject alignments whose indices fall outside either sentence.
        _check_alignment(len(self.words), len(self.mots), alignment)
        self._alignment = alignment

    alignment = property(_get_alignment, _set_alignment)

    def __repr__(self):
        """
        Build a constructor-like string representation of this ``AlignedSent``.

        :rtype: str
        """
        quoted_words = ", ".join("'%s'" % token for token in self._words)
        quoted_mots = ", ".join("'%s'" % token for token in self._mots)
        return "AlignedSent([{}], [{}], {!r})".format(
            quoted_words, quoted_mots, self._alignment
        )

    def _to_dot(self):
        """
        Render the aligned sentence as Graphviz dot source.

        :rtype: str
        """
        words, mots = self._words, self._mots
        chunks = ["graph align {\n", "node[shape=plaintext]\n"]

        # One plaintext node per token; node names are derived from the token text.
        chunks.extend(f'"{w}_source" [label="{w}"] \n' for w in words)
        chunks.extend(f'"{m}_target" [label="{m}"] \n' for m in mots)

        # One edge per alignment link.
        chunks.extend(
            f'"{words[u]}_source" -- "{mots[v]}_target" \n'
            for u, v in self._alignment
        )

        # Invisible edges chain neighbouring tokens of the first sentence ...
        for i in range(1, len(words)):
            chunks.append(
                f'"{words[i - 1]}_source" -- "{words[i]}_source" [style=invis]\n'
            )

        # ... and of the second sentence.
        for i in range(1, len(mots)):
            chunks.append(
                f'"{mots[i - 1]}_target" -- "{mots[i]}_target" [style=invis]\n'
            )

        # Pin each sentence to a single rank.
        source_rank = " ".join('"%s_source"' % w for w in words)
        target_rank = " ".join('"%s_target"' % m for m in mots)
        chunks.append("{rank = same; %s}\n" % source_rank)
        chunks.append("{rank = same; %s}\n" % target_rank)

        chunks.append("}")
        return "".join(chunks)

    def _repr_svg_(self):
        """
        Ipython magic : show SVG representation of this ``AlignedSent``.

        Pipes the dot source through the external ``dot`` program and returns
        its standard output decoded as UTF-8.

        :raise Exception: if starting the ``dot`` process fails with ``OSError``
        :rtype: str
        """
        dot_source = self._to_dot().encode("utf8")
        output_format = "svg"
        try:
            dot_process = subprocess.Popen(
                ["dot", "-T" + output_format],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError as e:
            raise Exception("Cannot find the dot binary from Graphviz package") from e
        # Standard error is captured but not inspected.
        rendered, _ = dot_process.communicate(dot_source)
        return rendered.decode("utf8")

    def __str__(self):
        """
        Build a short human-readable summary of this ``AlignedSent``.

        Each sentence is space-joined, cut to 20 characters and always
        followed by an ellipsis.

        :rtype: str
        """

        def preview(tokens):
            return " ".join(tokens)[:20] + "..."

        return "<AlignedSent: '{}' -> '{}'>".format(
            preview(self._words), preview(self._mots)
        )

    def invert(self):
        """
        Return the aligned sentence pair, reversing the directionality

        :rtype: AlignedSent
        """
        flipped_alignment = self._alignment.invert()
        return AlignedSent(self._mots, self._words, flipped_alignment)
|
||||
|
||||
|
||||
class Alignment(frozenset):
    """
    A storage class for representing alignment between two sequences, s1, s2.
    In general, an alignment is a set of tuples of the form (i, j, ...)
    representing an alignment between the i-th element of s1 and the
    j-th element of s2.  Tuples are extensible (they might contain
    additional data, such as a boolean to indicate sure vs possible alignments).

        >>> from nltk.translate import Alignment
        >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
        >>> a.invert()
        Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
        >>> print(a.invert())
        0-0 1-0 2-1 2-2
        >>> a[0]
        [(0, 1), (0, 0)]
        >>> a.invert()[2]
        [(2, 1), (2, 2)]
        >>> b = Alignment([(0, 0), (0, 1)])
        >>> b.issubset(a)
        True
        >>> c = Alignment.fromstring('0-0 0-1')
        >>> b == c
        True
    """

    def __new__(cls, pairs):
        self = frozenset.__new__(cls, pairs)
        # Largest first-position index seen; 0 for an empty alignment.
        self._len = max((p[0] for p in self), default=0)
        # Per-position lookup table, built lazily on first use.
        self._index = None
        return self

    @classmethod
    def fromstring(cls, s):
        """
        Read a giza-formatted string and return an Alignment object.

            >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
            Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])

        :type s: str
        :param s: the positional alignments in giza format
        :rtype: Alignment
        :return: An Alignment object corresponding to the string representation ``s``.
        """

        return Alignment([_giza2pair(a) for a in s.split()])

    def __getitem__(self, key):
        """
        Look up the alignments that map from a given index or slice.
        """
        if self._index is None:
            self._build_index()
        return self._index.__getitem__(key)

    def invert(self):
        """
        Return an Alignment object, being the inverted mapping.

        Only the first two elements of each tuple are swapped; any extra
        elements are carried over unchanged.
        """
        return Alignment(((p[1], p[0]) + p[2:]) for p in self)

    def range(self, positions=None):
        """
        Work out the range of the mapping from the given positions.
        If no positions are specified, compute the range of the entire mapping.

        :param positions: first-sequence indices to look up; ``None`` (or any
            empty value) selects every indexed position
        :return: the sorted, de-duplicated second-sequence indices reached
        :rtype: list
        """
        image = set()
        if self._index is None:
            self._build_index()
        if not positions:
            # ``range`` here is the builtin: class-level names are not
            # visible from inside a method body.
            positions = list(range(len(self._index)))
        for p in positions:
            # Index the second element instead of unpacking 2-tuples, so
            # extended tuples (i, j, ...) are handled as well.
            image.update(pair[1] for pair in self._index[p])
        return sorted(image)

    def __repr__(self):
        """
        Produce a constructor-like representation listing the sorted tuples.
        """
        return "Alignment(%r)" % sorted(self)

    def __str__(self):
        """
        Produce a Giza-formatted string representing the alignment.

        Only the first two elements of each tuple are written.
        """
        return " ".join("%d-%d" % p[:2] for p in sorted(self))

    def _build_index(self):
        """
        Build a list self._index such that self._index[i] is a list
        of the alignments originating from word i.
        """
        self._index = [[] for _ in range(self._len + 1)]
        for p in self:
            self._index[p[0]].append(p)
|
||||
|
||||
|
||||
def _giza2pair(pair_string):
|
||||
i, j = pair_string.split("-")
|
||||
return int(i), int(j)
|
||||
|
||||
|
||||
def _naacl2pair(pair_string):
|
||||
i, j, p = pair_string.split("-")
|
||||
return int(i), int(j)
|
||||
|
||||
|
||||
def _check_alignment(num_words, num_mots, alignment):
    """
    Verify that every alignment pair points inside both sentences.

    All first-position indices are checked before any second-position
    index. A second-position index of ``None`` is accepted as is.

    :param num_words: length of the sequence indexed by the first element
        of each pair
    :type num_words: int
    :param num_mots: length of the sequence indexed by the second element
        of each pair
    :type num_mots: int
    :param alignment: alignment to be checked
    :type alignment: Alignment
    :raise IndexError: if alignment falls outside the sentence
    """

    assert type(alignment) is Alignment

    words_out_of_bounds = any(
        not (0 <= pair[0] < num_words) for pair in alignment
    )
    if words_out_of_bounds:
        raise IndexError("Alignment is outside boundary of words")

    mots_out_of_bounds = any(
        pair[1] is not None and not (0 <= pair[1] < num_mots)
        for pair in alignment
    )
    if mots_out_of_bounds:
        raise IndexError("Alignment is outside boundary of mots")
|
||||
|
||||
|
||||
PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"])


class PhraseTable:
    """
    In-memory store of the translations known for each source phrase,
    together with the log probability of each translation.
    """

    def __init__(self):
        # Maps a source phrase to its list of PhraseTableEntry items.
        self.src_phrases = {}

    def translations_for(self, src_phrase):
        """
        Look up the translations recorded for a source language phrase.

        :param src_phrase: Source language phrase of interest
        :type src_phrase: tuple(str)

        :return: The entries stored for ``src_phrase``, highest log
            probability first. Each entry pairs a target phrase with its
            log probability.
        :rtype: list(PhraseTableEntry)
        :raise KeyError: if ``src_phrase`` has never been added
        """
        return self.src_phrases[src_phrase]

    def add(self, src_phrase, trg_phrase, log_prob):
        """
        Record ``trg_phrase`` as a translation of ``src_phrase``.

        :type src_phrase: tuple(str)
        :type trg_phrase: tuple(str)

        :param log_prob: Log probability that given ``src_phrase``,
            ``trg_phrase`` is its translation
        :type log_prob: float
        """
        candidates = self.src_phrases.setdefault(src_phrase, [])
        candidates.append(PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob))
        # Re-sort after every insertion so the list stays best-first.
        candidates.sort(key=lambda candidate: candidate.log_prob, reverse=True)

    def __contains__(self, src_phrase):
        """Report whether any translation is stored for ``src_phrase``."""
        return src_phrase in self.src_phrases
|
||||
714
backend/venv/Lib/site-packages/nltk/translate/bleu_score.py
Normal file
714
backend/venv/Lib/site-packages/nltk/translate/bleu_score.py
Normal file
@@ -0,0 +1,714 @@
|
||||
# Natural Language Toolkit: BLEU Score
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
|
||||
# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""BLEU score implementation."""
|
||||
import math
|
||||
import sys
|
||||
import warnings
|
||||
from collections import Counter
|
||||
from fractions import Fraction as _Fraction
|
||||
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
class Fraction(_Fraction):
    """
    ``fractions.Fraction`` subclass that can report the numerator and
    denominator exactly as they were passed in, without reduction.

    From Python 3.12 on the base constructor is called without the private
    ``_normalize`` keyword, so the original operands are kept on the
    instance and returned by the properties whenever ``_normalize`` is false.
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=False):
        if sys.version_info < (3, 12):
            instance = super().__new__(
                cls, numerator, denominator, _normalize=_normalize
            )
        else:
            # ``_normalize`` is not forwarded on 3.12+.
            instance = super().__new__(cls, numerator, denominator)
        # Keep the operands as given so they can be reported unreduced.
        instance._original_numerator = numerator
        instance._original_denominator = denominator
        instance._normalize = _normalize
        return instance

    @property
    def numerator(self):
        """The numerator as given, or the reduced one when normalizing."""
        if self._normalize:
            return super().numerator
        return self._original_numerator

    @property
    def denominator(self):
        """The denominator as given, or the reduced one when normalizing."""
        if self._normalize:
            return super().denominator
        return self._original_denominator
|
||||
|
||||
|
||||
def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Score a single hypothesis with BLEU (Bilingual Evaluation Understudy), from
    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
    "BLEU: a method for automatic evaluation of machine translation."
    In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf

    This is ``corpus_bleu`` applied to a corpus that contains just this one
    hypothesis and its references.

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    0.5045...

    If there is no ngrams overlap for any order of n-grams, BLEU returns the
    value 0. This is because the precision for the order of n-grams without
    overlap is 0, and the geometric mean in the final BLEU score computation
    multiplies the 0 with the precision of other n-grams. This results in 0
    (independently of the precision of the other n-gram orders). The following
    example has zero 3-gram and 4-gram overlaps:

    >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
    0.0

    To avoid this harsh behaviour when no ngram overlaps are found a smoothing
    function can be used.

    >>> chencherry = SmoothingFunction()
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
    ...     smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
    0.0370...

    The default BLEU calculates a score for up to 4-grams using uniform
    weights (this is called BLEU-4). To evaluate your translations with
    higher/lower order ngrams, use customized weights. E.g. when accounting
    for up to 5-grams with uniform weights (this is called BLEU-5) use:

    >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    0.3920...

    Multiple BLEU scores can be computed at once, by supplying a list of weights.
    E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:
    >>> weights = [
    ...     (1./2., 1./2.),
    ...     (1./3., 1./3., 1./3.),
    ...     (1./4., 1./4., 1./4., 1./4.)
    ... ]
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    [0.7453..., 0.6240..., 0.5045...]

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
    :type weights: tuple(float) / list(tuple(float))
    :param smoothing_function: smoothing method forwarded to ``corpus_bleu``
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
    :rtype: float / list(float)
    """
    # Wrap both arguments so they form a one-segment corpus.
    single_segment_references = [references]
    single_segment_hypotheses = [hypothesis]
    return corpus_bleu(
        single_segment_references,
        single_segment_hypotheses,
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
    )
|
||||
|
||||
|
||||
def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below show that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    Custom weights may be supplied to fine-tune the BLEU score further.
    A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
    >>> weights = (0.1, 0.3, 0.5, 0.1)
    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
    0.5818...

    This particular weight gave extra value to trigrams.
    Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
    >>> weights = [
    ...     (0.5, 0.5),
    ...     (0.333, 0.333, 0.334),
    ...     (0.25, 0.25, 0.25, 0.25),
    ...     (0.2, 0.2, 0.2, 0.2, 0.2)
    ... ]
    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
    [0.8242..., 0.7067..., 0.5920..., 0.4719...]

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
    :type weights: tuple(float) / list(tuple(float))
    :param smoothing_function: callable applied to the list of per-order
        precisions; defaults to ``SmoothingFunction().method0``
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score. Returns a list if multiple weights were supplied.
    :rtype: float / list(float)
    :raise AssertionError: if the number of hypotheses differs from the
        number of reference lists
    """
    p_numerators = Counter()  # Key = ngram order, and value = no. of clipped ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngrams in the hypotheses.
    hyp_lengths, ref_lengths = 0, 0

    # Before proceeding to compute BLEU, perform sanity checks.
    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the " "same "
    )

    # Accept either one weight vector or a sequence of weight vectors.
    # Probing ``weights[0][0]`` fails for a flat (or empty) vector; only the
    # lookup errors that signal this are caught, so KeyboardInterrupt and
    # SystemExit are no longer swallowed here.
    try:
        weights[0][0]
    except (TypeError, IndexError, KeyError):
        weights = [weights]
    max_weight_length = max(len(weight) for weight in weights)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, max_weight_length + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i in range(1, max_weight_length + 1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0 if len(weights) == 1 else [0] * len(weights)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    # ``references`` and ``hypothesis`` are the values left over from the
    # final iteration of the loop above.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )

    bleu_scores = []
    for weight in weights:
        # Uniformly re-weighting based on maximum hypothesis lengths if largest
        # order of n-grams < 4 and weights is set at default.
        if auto_reweigh:
            if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
                weight = (1 / hyp_lengths,) * hyp_lengths

        # Weighted sum of log precisions; orders with zero precision are skipped.
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
        s = bp * math.exp(math.fsum(s))
        bleu_scores.append(s)
    return bleu_scores[0] if len(weights) == 1 else bleu_scores
|
||||
|
||||
|
||||
def modified_precision(references, hypothesis, n):
    """
    Compute BLEU's modified (clipped) n-gram precision.

    Plain precision can reward wrong translations, e.g. a hypothesis that
    simply repeats one word of the reference many times. Here each
    hypothesis n-gram count is clipped at the highest count that n-gram
    reaches in any single reference.

    The result is a Fraction whose numerator and denominator are left
    unreduced, because the corpus-level precision is built by summing them
    across sentences. To get the precision of one hypothesis/references
    pair, cast the Fraction to a float.

    The famous "the the the ... " example shows that you can get BLEU precision
    by duplicating high frequency words.

        >>> reference1 = 'the cat is on the mat'.split()
        >>> reference2 = 'there is a cat on the mat'.split()
        >>> hypothesis1 = 'the the the the the the the'.split()
        >>> references = [reference1, reference2]
        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
        0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
        ...               'ensures', 'that', 'the', 'military', 'will',
        ...               'forever', 'heed', 'Party', 'commands']
        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
        ...               'guarantees', 'the', 'military', 'forces', 'always',
        ...               'being', 'under', 'the', 'command', 'of', 'the',
        ...               'Party']
        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
        ...               'of', 'the', 'party']
        >>> hypothesis = 'of the'.split()
        >>> references = [reference1, reference2, reference3]
        >>> float(modified_precision(references, hypothesis, n=1))
        1.0
        >>> float(modified_precision(references, hypothesis, n=2))
        1.0

    An example of a normal machine translation hypothesis:

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
        ...               'ensures', 'that', 'the', 'military', 'always',
        ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

        >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
        ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
        ...               'that', 'party', 'direct']

        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
        ...               'ensures', 'that', 'the', 'military', 'will',
        ...               'forever', 'heed', 'Party', 'commands']

        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
        ...               'guarantees', 'the', 'military', 'forces', 'always',
        ...               'being', 'under', 'the', 'command', 'of', 'the',
        ...               'Party']

        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
        ...               'of', 'the', 'party']
        >>> references = [reference1, reference2, reference3]
        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
        0.9444...
        >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
        0.5714...
        >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
        0.5882352941176471
        >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
        0.07692...


    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram.
    :rtype: Fraction
    """

    def ngram_counts(tokens):
        # A sequence shorter than ``n`` is given an empty Counter.
        if len(tokens) < n:
            return Counter()
        return Counter(ngrams(tokens, n))

    hyp_counts = ngram_counts(hypothesis)

    # For each hypothesis n-gram, the largest count it reaches in any one
    # reference.
    ceiling = {}
    for ref in references:
        ref_counts = ngram_counts(ref)
        for gram in hyp_counts:
            ceiling[gram] = max(ceiling.get(gram, 0), ref_counts[gram])

    # Clip every hypothesis count at that ceiling and add the results up.
    matched = sum(min(total, ceiling[gram]) for gram, total in hyp_counts.items())

    # Floor the denominator at 1 so a hypothesis with no n-grams of this
    # order does not produce a zero denominator.
    total_ngrams = max(1, sum(hyp_counts.values()))

    return Fraction(matched, total_ngrams, _normalize=False)
|
||||
|
||||
|
||||
def closest_ref_length(references, hyp_len):
    """
    Pick the reference length that lies nearest to the hypothesis length.
    This is the *r* variable of the brevity penalty formula in
    Papineni et. al. (2002). When two lengths are equally near, the
    shorter one is chosen.

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hyp_len: The length of the hypothesis.
    :type hyp_len: int
    :return: The length of the reference that's closest to the hypothesis.
    :rtype: int
    """

    def distance_then_length(ref_len):
        # Rank by distance first; the length itself breaks ties.
        return (abs(ref_len - hyp_len), ref_len)

    return min(map(len, references), key=distance_then_length)
|
||||
|
||||
|
||||
def brevity_penalty(closest_ref_len, hyp_len):
    """
    Calculate brevity penalty.

    As the modified n-gram precision still has the problem from the short
    length sentence, brevity penalty is used to modify the overall BLEU
    score according to length.

    An example from the paper. There are three references with length 12, 15
    and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.

    >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
    >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
    >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> references = [reference1, reference2, reference3]
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    In case a hypothesis translation is shorter than the references, penalty is
    applied.

    >>> references = [['a'] * 28, ['a'] * 28]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    0.2635971381157267

    The length of the closest reference is used to compute the penalty. If the
    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
    penalty is applied because the hypothesis length (12) is less than the
    closest reference length (13).

    >>> references = [['a'] * 13, ['a'] * 2]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.9200...

    The brevity penalty doesn't depend on reference order. More importantly,
    when two reference sentences are at the same distance, the shortest
    reference sentence length is used.

    >>> references = [['a'] * 13, ['a'] * 11]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(reversed(references), hyp_len)
    >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
    >>> bp1 == bp2 == 1
    True

    A test example from mteval-v13a.pl (starting from the line 705):

    >>> references = [['a'] * 11, ['a'] * 8]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.8668...

    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len =  closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    :param hyp_len: The length of the hypothesis for a single sentence OR the
        sum of all the hypotheses' lengths for a corpus
    :type hyp_len: int
    :param closest_ref_len: The length of the closest reference for a single
        hypothesis OR the sum of all the closest references for every hypotheses.
    :type closest_ref_len: int
    :return: BLEU's brevity penalty. Always a float: ``1.0`` when the
        hypothesis is longer than the closest reference, ``0.0`` when the
        hypothesis is empty, ``exp(1 - closest_ref_len / hyp_len)`` otherwise.
    :rtype: float
    """
    # A hypothesis longer than its closest reference is never penalised.
    if hyp_len > closest_ref_len:
        return 1.0
    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0.
    # This check also protects the division below.
    if hyp_len == 0:
        return 0.0
    return math.exp(1 - closest_ref_len / hyp_len)
|
||||
|
||||
|
||||
class SmoothingFunction:
    """
    This is an implementation of the smoothing techniques
    for segment-level BLEU scores that was presented in
    Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf

    Every ``methodN`` receives ``p_n``, the list of modified n-gram
    precisions ordered from unigrams upwards (objects exposing ``numerator``
    and ``denominator``), and returns a list of smoothed precisions of the
    same length. The list passed in by the caller is never modified.
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        """
        This will initialize the parameters required for the various smoothing
        techniques, the default values are set to the numbers used in the
        experiments from Chen and Cherry (2014).

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
        ...                 'that', 'the', 'military', 'always', 'obeys', 'the',
        ...                 'commands', 'of', 'the', 'party']
        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
        ...               'Party', 'commands']

        >>> chencherry = SmoothingFunction()
        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
        0.4452...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
        0.4905...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
        0.4135...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
        0.4905...

        :param epsilon: the epsilon value used in method 1
        :type epsilon: float
        :param alpha: the alpha value used in method 6
        :type alpha: int
        :param k: the k value used in method 4
        :type k: int
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.

        Precisions with a zero numerator are replaced by
        ``sys.float_info.min`` and a warning is issued for each of them.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = str(
                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                    "Therefore the BLEU score evaluates to 0, independently of\n"
                    "how many N-gram overlaps of lower order it contains.\n"
                    "Consider using lower n-gram order or use "
                    "SmoothingFunction()"
                ).format(i + 1)
                warnings.warn(_msg)
                # When numerator==0 where denominator==0 or !=0, the result
                # for the precision score should be equal to 0 or undefined.
                # Due to BLEU geometric mean computation in logarithm space,
                # we need to return sys.float_info.min such that
                # math.log(sys.float_info.min) returns a 0 precision score.
                p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
        """
        return [
            (
                (p_i.numerator + self.epsilon) / p_i.denominator
                if p_i.numerator == 0
                else p_i
            )
            for p_i in p_n
        ]

    def method2(self, p_n, *args, **kwargs):
        """
        Smoothing method 2: Add 1 to both numerator and denominator from
        Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for
        Evaluating Automatic Evaluation Metrics for Machine Translation.
        In COLING 2004.

        The first (unigram) precision is left untouched.
        """
        return [
            (
                Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False)
                if i != 0
                else p_i
            )
            for i, p_i in enumerate(p_n)
        ]

    def method3(self, p_n, *args, **kwargs):
        """
        Smoothing method 3: NIST geometric sequence smoothing
        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
        precision score whose matching n-gram count is null.
        k is 1 for the first 'n' value for which the n-gram match count is null.

        For example, if the text contains:

        - one 2-gram match
        - and (consequently) two 1-gram matches

        the n-gram count for each individual precision score would be:

        - n=1  =>  prec_count = 2     (two unigrams)
        - n=2  =>  prec_count = 1     (one bigram)
        - n=3  =>  prec_count = 1/2   (no trigram,  taking 'smoothed' value of 1 / ( 2^k ), with k=1)
        - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
        """
        # Work on a copy so the caller's list is left untouched.
        smoothed = list(p_n)
        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0:
                smoothed[i] = 1 / (2**incvnt * p_i.denominator)
                incvnt += 1
        return smoothed

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 4:
        Shorter translations may have inflated precision values due to having
        smaller denominators; therefore, we give them proportionally
        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
        suggests dividing by 1/ln(len(T)), where T is the length of the translation.

        Precisions are only replaced when the hypothesis length is greater
        than 1, because ``math.log(1)`` is zero and would make the scaling
        undefined.
        """
        # Work on a copy so the caller's list is left untouched.
        smoothed = list(p_n)
        incvnt = 1
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0 and hyp_len > 1:
                # Note that self.k is different from the k (incvnt) of the
                # NIST geometric sequence used in method3.
                numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
                smoothed[i] = numerator / p_i.denominator
                incvnt += 1
        return smoothed

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 5:
        The matched counts for similar values of n should be similar. To
        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
        matched counts.

        The precision of the order following the last one in ``p_n`` is
        obtained from ``modified_precision``; the value standing in for the
        order before the unigrams is ``p_n[0] + 1``.
        """
        p_n = list(p_n)
        # Requires a precision value for one additional ngram order, i.e. the
        # order right after the highest one present in p_n.
        p_n_plus1 = p_n + [modified_precision(references, hypothesis, len(p_n) + 1)]
        smoothed = []
        previous = p_n[0] + 1
        for i, p_i in enumerate(p_n):
            # Average of the smoothed previous order, the raw current order
            # and the raw next order.
            current = (previous + p_i + p_n_plus1[i + 1]) / 3
            smoothed.append(current)
            previous = current
        return smoothed

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
        Gao and He (2013) Training MRF-Based Phrase Translation Models using
        Gradient Ascent. In NAACL.

        :raises AssertionError: if ``p_n`` has fewer than two orders or the
            bigram precision is zero.
        """
        original = list(p_n)
        smoothed = list(original)
        # This smoothing only works when p_1 and p_2 are non-zero, because the
        # prior for every higher order is derived from them.
        # The check is kept as an assertion so that callers catching
        # AssertionError keep working.
        assert (
            len(original) > 1 and original[1]
        ), "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(original):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            # The prior uses the (already smoothed) two preceding orders.
            pi0 = 0 if smoothed[i - 2] == 0 else smoothed[i - 1] ** 2 / smoothed[i - 2]
            # No. of ngrams in translation that matches the reference.
            m = p_i.numerator
            # No. of ngrams in translation.
            ngram_total = sum(1 for _ in ngrams(hypothesis, i + 1))
            # Calculates the interpolated precision.
            smoothed[i] = (m + self.alpha * pi0) / (ngram_total + self.alpha)
        return smoothed

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Interpolates methods 4 and 5.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n
|
||||
221
backend/venv/Lib/site-packages/nltk/translate/chrf_score.py
Normal file
221
backend/venv/Lib/site-packages/nltk/translate/chrf_score.py
Normal file
@@ -0,0 +1,221 @@
|
||||
# Natural Language Toolkit: ChrF score
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Maja Popovic
|
||||
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
""" ChrF score implementation """
|
||||
import re
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
def sentence_chrf(
    reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Compute the sentence level CHRF (Character n-gram F-score) of one
    hypothesis against one reference, as described in

    - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
      In Proceedings of the 10th Workshop on Machine Translation.
      https://www.statmt.org/wmt15/pdf/WMT49.pdf
    - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
      In Proceedings of the 1st Conference on Machine Translation.
      https://www.statmt.org/wmt16/pdf/W16-2341.pdf

    The score is obtained by handing a one-sentence corpus to ``corpus_chrf``.
    Only a single reference is supported at the moment.

    For details not reported in the paper, consult Maja Popovic's original
    implementation: https://github.com/m-popovic/chrF

    The code should output results equivalent to running CHRF++ with the
    following options: -nw 0 -b 3

    An example from the original BLEU paper
    https://www.aclweb.org/anthology/P02-1040.pdf

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct').split()
        >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
        0.6349...
        >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
        0.3330...

    The infamous "the the the ... " example

        >>> ref = 'the cat is on the mat'.split()
        >>> hyp = 'the the the the the the the'.split()
        >>> sentence_chrf(ref, hyp)  # doctest: +ELLIPSIS
        0.1468...

    An example to show that this function allows users to use strings instead of
    tokens, i.e. list(str) as inputs.

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands')
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party')
        >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
        0.6349...
        >>> type(ref1) == type(hyp1) == str
        True
        >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
        0.6349...

    To skip the unigrams and only use 2- to 3-grams:

        >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
        0.6617...

    :param reference: reference sentence
    :type reference: list(str) / str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str) / str
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the sentence level CHRF score.
    :rtype: float
    """
    # A sentence is scored as a corpus that contains exactly one segment.
    single_reference_corpus = [reference]
    single_hypothesis_corpus = [hypothesis]
    return corpus_chrf(
        single_reference_corpus,
        single_hypothesis_corpus,
        min_len,
        max_len,
        beta=beta,
        ignore_whitespace=ignore_whitespace,
    )
|
||||
|
||||
|
||||
def _preprocess(sent, ignore_whitespace):
|
||||
if type(sent) != str:
|
||||
# turn list of tokens into a string
|
||||
sent = " ".join(sent)
|
||||
|
||||
if ignore_whitespace:
|
||||
sent = re.sub(r"\s+", "", sent)
|
||||
return sent
|
||||
|
||||
|
||||
def chrf_precision_recall_fscore_support(
    reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
    """
    Compute precision, recall and F-score from the overlap of the order-``n``
    ngrams of ``reference`` and ``hypothesis``, together with the `support`,
    i.e. the number of matching ngrams (true positives).

    The inputs are deliberately underspecified: whatever elements the two
    sequences hold (tokens or characters) are what the ngrams are built from.

    Whenever the computation hits a division by zero, precision, recall and
    F-score are all set to ``epsilon``. That happens when either side yields
    no ngrams, and also when no ngram matches at all.

    :param reference: The reference sentence.
    :type reference: list
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list
    :param n: The order of the ngrams to extract.
    :type n: int
    :param beta: The parameter to assign more importance to recall over precision.
    :type beta: float
    :param epsilon: The fallback value used when a division by zero occurs.
    :type epsilon: float
    :return: Returns the precision, recall and f-score and support (true positive).
    :rtype: tuple(float)
    """
    reference_counts = Counter(ngrams(reference, n))
    hypothesis_counts = Counter(ngrams(hypothesis, n))

    # Counter intersection keeps, per ngram, the smaller of the two counts.
    matched = sum((reference_counts & hypothesis_counts).values())
    hypothesis_total = sum(hypothesis_counts.values())  # TP + FP
    reference_total = sum(reference_counts.values())  # TP + FN

    try:
        precision = matched / hypothesis_total
        recall = matched / reference_total
        beta_squared = beta**2
        fscore = (
            (1 + beta_squared)
            * (precision * recall)
            / (beta_squared * precision + recall)
        )
    except ZeroDivisionError:
        return epsilon, epsilon, epsilon, matched
    return precision, recall, fscore, matched
|
||||
|
||||
|
||||
def corpus_chrf(
    references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Compute the corpus level CHRF (Character n-gram F-score): the F-scores of
    every sentence pair and every n-gram order are macro-averaged, first over
    the n-gram orders and then over the sentences.

    This implementation of CHRF only supports a single reference at the moment.

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> ref2 = str('It is the guiding principle which guarantees the military '
        ...            'forces always being under the command of the Party').split()
        >>>
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct')
        >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
        0.3910...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the corpus level CHRF score.
    :rtype: float
    """

    assert len(references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"
    sentence_count = len(hypotheses)

    # One list of f-scores per n-gram order, filled sentence by sentence.
    fscores_by_order = defaultdict(list)

    for raw_reference, raw_hypothesis in zip(references, hypotheses):
        reference_text = _preprocess(raw_reference, ignore_whitespace)
        hypothesis_text = _preprocess(raw_hypothesis, ignore_whitespace)

        for order in range(min_len, max_len + 1):
            # Only the f-score of the (precision, recall, f-score, support)
            # result is needed here.
            _, _, fscore, _ = chrf_precision_recall_fscore_support(
                reference_text, hypothesis_text, order, beta=beta
            )
            fscores_by_order[order].append(fscore)

    order_count = len(fscores_by_order)

    # Sum over all sentences, separately for each n-gram order.
    per_order_totals = [sum(fscores) for fscores in fscores_by_order.values()]

    # Average over the n-gram orders first, then over the sentences.
    return (sum(per_order_totals) / order_count) / sentence_count
|
||||
263
backend/venv/Lib/site-packages/nltk/translate/gale_church.py
Normal file
263
backend/venv/Lib/site-packages/nltk/translate/gale_church.py
Normal file
@@ -0,0 +1,263 @@
|
||||
# Natural Language Toolkit: Gale-Church Aligner
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Torsten Marek <marek@ifi.uzh.ch>
|
||||
# Contributor: Cassidy Laidlaw, Liling Tan
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
|
||||
A port of the Gale-Church Aligner.
|
||||
|
||||
Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
|
||||
https://aclweb.org/anthology/J93-1004.pdf
|
||||
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
try:
    # NOTE(review): the first import targets a top-level module called
    # ``norm``. If no such module is installed this raises ImportError and the
    # pure-Python fallback below is used even when scipy is available --
    # confirm whether ``scipy.stats.norm.logsf`` was intended here.
    from norm import logsf as norm_logsf
    from scipy.stats import norm
except ImportError:

    def erfcc(x):
        """Complementary error function (polynomial approximation)."""
        # Coefficients of the polynomial in t, highest power first. They are
        # folded with Horner's rule: poly = c + t * poly.
        coefficients = (
            0.17087277,
            -0.82215223,
            1.48851587,
            -1.13520398,
            0.27886807,
            -0.18628806,
            0.09678418,
            0.37409196,
            1.00002368,
        )
        z = abs(x)
        t = 1 / (1 + 0.5 * z)
        poly = coefficients[0]
        for coefficient in coefficients[1:]:
            poly = coefficient + t * poly
        r = t * math.exp(-z * z - 1.26551223 + t * poly)
        # The approximation is evaluated on |x|; negative arguments are
        # obtained by reflection.
        return r if x >= 0.0 else 2.0 - r

    def norm_cdf(x):
        """Return the area under the normal distribution from -∞ to x."""
        return 1 - 0.5 * erfcc(x / math.sqrt(2))

    def norm_logsf(x):
        """Return the log of the area under the normal distribution above x.

        ``float("-inf")`` is returned when ``math.log`` rejects its argument.
        """
        try:
            log_survival = math.log(1 - norm_cdf(x))
        except ValueError:
            log_survival = float("-inf")
        return log_survival
|
||||
|
||||
|
||||
# Natural logarithm of 2; align_log_prob adds it to the log survival
# probability and the log prior of every alignment it scores.
LOG2 = math.log(2)
|
||||
|
||||
|
||||
class LanguageIndependent:
    # These are the language-independent probabilities and parameters
    # given in Gale & Church

    # for the computation, l_1 is always the language with less characters

    # Prior probability of each alignment type. A key (s, t) stands for
    # s source sentences aligned with t target sentences. align_blocks tries
    # the types in this dict's order and only replaces the best candidate on a
    # strictly lower cost, so on ties the type listed first is kept.
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    # Factor relating source and target lengths in align_log_prob: the source
    # length is multiplied by it and the target length is divided by it.
    AVERAGE_CHARACTERS = 1
    # Multiplied with the mean length inside the square root that normalises
    # the length difference in align_log_prob.
    VARIANCE_CHARACTERS = 6.8
|
||||
|
||||
|
||||
def trace(backlinks, source_sents_lens, target_sents_lens):
    """
    Walk the backlinks from the last alignment point back towards (0, 0) and
    collect the aligned sentence index pairs.

    At every position the backlink ``(s, t)`` says that the last ``s`` source
    sentences are aligned with the last ``t`` target sentences; all ``s * t``
    index pairs are recorded and the position moves back by ``(s, t)``. A
    backlink whose unpacking raises ``TypeError`` (e.g. ``None``) is skipped
    by stepping back one sentence on both sides.

    :param backlinks: A dictionary mapping an alignment point to the pair
        ``(s, t)`` describing the alignment type chosen at that point
    :type backlinks: dict
    :param source_sents_lens: A list of source sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    :return: the aligned (source index, target index) pairs, first pair first
    :rtype: list(tuple(int, int))
    """
    src_pos, trg_pos = len(source_sents_lens), len(target_sents_lens)
    collected = []  # filled back to front, reversed before returning
    while (src_pos, trg_pos) != (0, 0) and src_pos >= 0 and trg_pos >= 0:
        try:
            src_step, trg_step = backlinks[(src_pos, trg_pos)]
        except TypeError:
            src_pos -= 1
            trg_pos -= 1
            continue
        collected.extend(
            (src_pos - i - 1, trg_pos - j - 1)
            for i in range(src_step)
            for j in range(trg_step)
        )
        src_pos -= src_step
        trg_pos -= trg_step

    collected.reverse()
    return collected
|
||||
|
||||
|
||||
def align_log_prob(i, j, source_sents, target_sents, alignment, params):
    """Scores aligning the sentences that end at C{source_sents[i - 1]} and
    C{target_sents[j - 1]} with a specific C{alignment}.

    The value is the negated sum of C{LOG2}, the log survival probability of
    the normalised length difference and the log prior of the alignment type,
    so a smaller value means a more probable alignment.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The negative log probability of a specific alignment between the
        two sentences, given the parameters; C{float("-inf")} if the
        computation divides by zero.
    """
    # Total length of the alignment[0] source sentences and the alignment[1]
    # target sentences that end right before offsets i and j.
    source_chars = sum(source_sents[i - back - 1] for back in range(alignment[0]))
    target_chars = sum(target_sents[j - back - 1] for back in range(alignment[1]))
    try:
        # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
        # reference implementation. With l_s in the denominator, insertions are impossible.
        mean_chars = (source_chars + target_chars / params.AVERAGE_CHARACTERS) / 2
        length_gap = source_chars * params.AVERAGE_CHARACTERS - target_chars
        scale = math.sqrt(mean_chars * params.VARIANCE_CHARACTERS)
        delta = length_gap / scale
    except ZeroDivisionError:
        return float("-inf")

    log_prob = LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment])
    return -log_prob
|
||||
|
||||
|
||||
def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).

    The alignment is found by dynamic programming over all pairs of positions
    (i source sentences consumed, j target sentences consumed). For every
    position the cheapest alignment type from C{params.PRIORS} is remembered
    as a backlink, and the final alignment is read off the backlinks with
    L{trace}.

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    """

    alignment_types = list(params.PRIORS.keys())

    # D holds the most recent rows of accumulated costs, one row per source
    # position; D[-1] is the row currently being filled.
    # there are always three rows in the history (with the last of them being filled)
    D = [[]]

    # Maps each position (i, j) to the alignment type chosen for it, or None
    # when no alignment type was applicable there.
    backlinks = {}

    for i in range(len(source_sents_lens) + 1):
        for j in range(len(target_sents_lens) + 1):
            min_dist = float("inf")
            min_align = None
            for a in alignment_types:
                # Row index relative to the end of D: stepping back a[0]
                # source sentences from the row being filled.
                prev_i = -1 - a[0]
                prev_j = j - a[1]
                # Skip alignment types that would reach before the start of
                # either block (or further back than the rows kept in D).
                if prev_i < -len(D) or prev_j < 0:
                    continue
                p = D[prev_i][prev_j] + align_log_prob(
                    i, j, source_sents_lens, target_sents_lens, a, params
                )
                # Strict comparison: on equal cost the earlier alignment type
                # is kept.
                if p < min_dist:
                    min_dist = p
                    min_align = a

            # No applicable alignment type (e.g. at the origin): the cell
            # costs nothing and its backlink stays None.
            if min_dist == float("inf"):
                min_dist = 0

            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)

        # Drop the oldest row once more than two are stored, then open a new
        # row for the next source position.
        if len(D) > 2:
            D.pop(0)
        D.append([])

    return trace(backlinks, source_sents_lens, target_sents_lens)
|
||||
|
||||
|
||||
def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
    """Creates the sentence alignment of two texts.

    A text is a sequence of blocks, and sentence alignment links never cross
    a block boundary: the n-th source block is only ever aligned with the
    n-th target block.

    Each block is a list holding the lengths (in characters) of the sentences
    in that block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    @raise ValueError: if the two texts have a different number of blocks.
    """
    if len(source_blocks) != len(target_blocks):
        raise ValueError(
            "Source and target texts do not have the same number of blocks."
        )

    block_alignments = []
    for source_block, target_block in zip(source_blocks, target_blocks):
        block_alignments.append(align_blocks(source_block, target_block, params))
    return block_alignments
|
||||
|
||||
|
||||
# File I/O functions; may belong in a corpus reader
|
||||
|
||||
|
||||
def split_at(it, split_value):
    """Splits an iterable C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used. Two adjacent C{split_value}s produce an empty subiterator.
    Iteration stops when C{it} is exhausted.

    @param it: The iterable to split; it is consumed lazily.
    @param split_value: The value that separates two chunks.
    @returns: A generator of generators, one per chunk.
    """
    # Accept any iterable and make sure every chunk draws from one shared
    # iterator.
    it = iter(it)

    def _chunk_iterator(first):
        v = first
        while v != split_value:
            yield v
            # Running out of input ends the chunk. StopIteration must not
            # escape a generator body (PEP 479 turns it into RuntimeError).
            try:
                v = next(it)
            except StopIteration:
                return

    while True:
        try:
            first = next(it)
        except StopIteration:
            return
        yield _chunk_iterator(first)
|
||||
|
||||
|
||||
def parse_token_stream(stream, soft_delimiter, hard_delimiter):
    """Turns a stream of tokens into the nested length lists expected by the
    L{align_texts} function.

    C{hard_delimiter} tokens separate blocks and C{soft_delimiter} tokens
    separate the sentences inside a block. Every sentence is represented by
    the summed length of its tokens.

    @param stream: The token stream to parse.
    @param soft_delimiter: The token that ends a sentence.
    @param hard_delimiter: The token that ends a block.
    @returns: A list of blocks, each a list of sentence lengths.
    """
    blocks = []
    for block_it in split_at(stream, hard_delimiter):
        sentence_lengths = []
        for sentence_it in split_at(block_it, soft_delimiter):
            # Each sentence iterator is consumed here, before the next one is
            # requested from split_at.
            sentence_lengths.append(sum(len(token) for token in sentence_it))
        blocks.append(sentence_lengths)
    return blocks
|
||||
138
backend/venv/Lib/site-packages/nltk/translate/gdfa.py
Normal file
138
backend/venv/Lib/site-packages/nltk/translate/gdfa.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# Natural Language Toolkit: GDFA word alignment symmetrization
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Liling Tan
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def grow_diag_final_and(srclen, trglen, e2f, f2e):
    """
    Symmetrize a source-to-target and a target-to-source word alignment
    with the grow-diag-final-and (GDFA) heuristic (Koehn et al., 2005).

    Step 1: Start from the intersection of the two directional alignments.

    Step 2: Grow the alignment by visiting the eight neighbors of every
            aligned point and adding those neighbors that appear in the
            union of the two directional alignments.

    Step 3: Add the remaining points of the union whose indices fall inside
            ``range(srclen)`` x ``range(trglen)``.

    >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
    ...         '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
    >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
    ...         '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '
    ...         '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')
    >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 "
    ...            "は L と 共 に 不連続 に 増加 する こと が "
    ...            "期待 さ れる こと を 示し た 。")
    >>> trgtext = ("Therefore , we expect that the luminosity function "
    ...            "of such halo white dwarfs increases discontinuously "
    ...            "with the luminosity .")
    >>> srclen = len(srctext.split())
    >>> trglen = len(trgtext.split())
    >>>
    >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
    >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
    ...         (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
    ...         13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
    ...         (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
    ...         12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
    ...         12), (11, 6), (12, 8)]))
    True

    References:
    Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
    2005. Edinburgh System Description for the 2005 IWSLT Speech
    Translation Evaluation. In MT Eval Workshop.

    :type srclen: int
    :param srclen: the number of tokens in the source language
    :type trglen: int
    :param trglen: the number of tokens in the target language
    :type e2f: str
    :param e2f: the forward word alignment outputs from source-to-target
                language (in pharaoh output format)
    :type f2e: str
    :param f2e: the backward word alignment outputs from target-to-source
                language (in pharaoh output format)
    :rtype: list(tuple(int))
    :return: the symmetrized alignment points, sorted in ascending order
    """

    # Pharaoh format is whitespace-separated "i-j" pairs; parse to int tuples.
    forward = [tuple(map(int, pair.split("-"))) for pair in e2f.split()]
    backward = [tuple(map(int, pair.split("-"))) for pair in f2e.split()]

    # Offsets of the four adjacent and four diagonal neighbors of a point.
    offsets = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
    forward_points, backward_points = set(forward), set(backward)
    alignment = forward_points & backward_points
    union = forward_points | backward_points

    # Indices that already take part in the alignment, stored under the
    # string keys "e" and "f".
    # NOTE(review): the membership tests below look an integer index up in
    # this dict itself, whose only keys are "e" and "f", so they can never
    # match and the effective criterion is just "point is in the union".
    # Presumably aligned["e"] / aligned["f"] were intended; left unchanged
    # because the documented example output depends on it -- confirm.
    aligned = defaultdict(set)
    for src_idx, trg_idx in alignment:
        aligned["e"].add(src_idx)
        aligned["f"].add(trg_idx)

    def _grow():
        """Repeatedly add union neighbors of the aligned points."""
        # Counts every accepted neighbor, including ones that were already
        # in the alignment; the sweep repeats only while this counter stays
        # below the alignment size.
        accepted = len(alignment) - 1
        while accepted < len(alignment):
            grew = False
            for e in range(srclen):
                for f in range(trglen):
                    if (e, f) not in alignment:
                        continue
                    for d_e, d_f in offsets:
                        e_new, f_new = e + d_e, f + d_f
                        candidate = (e_new, f_new)
                        if candidate not in union:
                            continue
                        if e_new in aligned or f_new in aligned:
                            continue
                        alignment.add(candidate)
                        aligned["e"].add(e_new)
                        aligned["f"].add(f_new)
                        accepted += 1
                        grew = True
            # A full sweep without any accepted neighbor ends the growth.
            if not grew:
                break

    def _finalize(_directional_alignment):
        """Add the in-range union points; the argument is not consulted."""
        for e_new in range(srclen):
            for f_new in range(trglen):
                if (e_new, f_new) not in union:
                    continue
                if e_new in aligned or f_new in aligned:
                    continue
                alignment.add((e_new, f_new))
                aligned["e"].add(e_new)
                aligned["f"].add(f_new)

    _grow()
    _finalize(forward)
    _finalize(backward)
    return sorted(alignment)
|
||||
190
backend/venv/Lib/site-packages/nltk/translate/gleu_score.py
Normal file
190
backend/venv/Lib/site-packages/nltk/translate/gleu_score.py
Normal file
@@ -0,0 +1,190 @@
|
||||
# Natural Language Toolkit: GLEU Score
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors:
|
||||
# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
""" GLEU score implementation. """
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from nltk.util import everygrams, ngrams
|
||||
|
||||
|
||||
def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """
    Compute the sentence-level GLEU (Google-BLEU) score introduced in

    Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
    Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
    Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
    Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
    George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
    Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
    Jeffrey Dean. (2016) Google’s Neural Machine Translation System:
    Bridging the Gap between Human and Machine Translation.
    eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
    Retrieved on 27 Oct 2016.

    From Wu et al. (2016):
        "The BLEU score has some undesirable properties when used for single
         sentences, as it was designed to be a corpus measure. We therefore
         use a slightly different score for our RL experiments which we call
         the 'GLEU score'. For the GLEU score, we record all sub-sequences of
         1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
         compute a recall, which is the ratio of the number of matching n-grams
         to the number of total n-grams in the target (ground truth) sequence,
         and a precision, which is the ratio of the number of matching n-grams
         to the number of total n-grams in the generated output sequence. Then
         GLEU score is simply the minimum of recall and precision. This GLEU
         score's range is always between 0 (no matches) and 1 (all match) and
         it is symmetrical when switching output and target. According to
         our experiments, GLEU score correlates quite well with the BLEU
         metric on a corpus level but does not have its drawbacks for our per
         sentence reward objective."

    Note: a list of references is required (consistent with
          bleu_score.sentence_bleu()); the initial implementation accepted
          only a single reference.

    The infamous "the the the ... " example

        >>> ref = 'the cat is on the mat'.split()
        >>> hyp = 'the the the the the the the'.split()
        >>> sentence_gleu([ref], hyp)  # doctest: +ELLIPSIS
        0.0909...

    An example to evaluate normal machine translation outputs

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct').split()
        >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS
        0.4393...
        >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS
        0.1206...

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: the sentence level GLEU score.
    :rtype: float
    """
    # A single sentence is scored as a corpus that contains exactly one
    # hypothesis together with its list of references.
    single_sentence_references = [references]
    single_sentence_hypotheses = [hypothesis]
    return corpus_gleu(
        single_sentence_references,
        single_sentence_hypotheses,
        min_len=min_len,
        max_len=max_len,
    )
|
||||
|
||||
|
||||
def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Compute one corpus-level GLEU score (aka. system-level GLEU) over all
    hypotheses and their respective references.

    The sentence-level scores are not averaged (i.e. no macro-average
    precision). Following Wu et al. (2016), the number of matching n-grams
    and the larger of the hypothesis and reference n-gram totals are summed
    over all sentences, and the score is computed from those aggregates.

    From Mike Schuster (via email):
        "For the corpus, we just add up the two statistics n_match and
         n_all = max(n_all_output, n_all_target) for all sentences, then
         calculate gleu_score = n_match / n_all, so it is not just a mean of
         the sentence gleu scores (in our case, longer sentences count more,
         which I think makes sense as they are more difficult to translate)."

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5673...

    The example below shows that corpus_gleu() is different from averaging
    sentence_gleu() for hypotheses

    >>> score1 = sentence_gleu([ref1a], hyp1)
    >>> score2 = sentence_gleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6144...

    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: The corpus-level GLEU score.
    :rtype: float
    """
    # Every hypothesis needs its own list of references.
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # Running totals of matches and of max(n-gram totals) over all sentences.
    total_match = 0
    total_all = 0

    for sentence_refs, sentence_hyp in zip(list_of_references, hypotheses):
        hyp_counter = Counter(everygrams(sentence_hyp, min_len, max_len))
        hyp_total = sum(hyp_counter.values())  # True positives + False positives.

        candidates = []
        for ref in sentence_refs:
            ref_counter = Counter(everygrams(ref, min_len, max_len))
            ref_total = sum(ref_counter.values())  # True positives + False negatives.

            # Counter intersection keeps the minimum count per n-gram.
            matched = sum((ref_counter & hyp_counter).values())  # True positives.

            # precision = matched / hyp_total and recall = matched / ref_total
            # share a numerator, so min(precision, recall) equals
            # matched / max(hyp_total, ref_total): one division instead of two.
            denominator = max(hyp_total, ref_total)
            if denominator > 0:
                candidates.append((matched, denominator))

        # No usable reference for this sentence: it contributes nothing.
        if not candidates:
            continue

        # Keep the statistics of the reference that gives the highest score.
        best_match, best_all = max(candidates, key=lambda pair: pair[0] / pair[1])
        total_match += best_match
        total_all += best_all

    # Corner case: empty corpus or empty references---don't divide by zero!
    if total_all == 0:
        return 0.0
    return total_match / total_all
|
||||
251
backend/venv/Lib/site-packages/nltk/translate/ibm1.py
Normal file
251
backend/venv/Lib/site-packages/nltk/translate/ibm1.py
Normal file
@@ -0,0 +1,251 @@
|
||||
# Natural Language Toolkit: IBM Model 1
|
||||
#
|
||||
# Copyright (C) 2001-2013 NLTK Project
|
||||
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
|
||||
# Hengfeng Li <hengfeng12345@gmail.com>
|
||||
# Ruxin Hou <r.hou@student.unimelb.edu.au>
|
||||
# Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
|
||||
# Based on earlier version by:
|
||||
# Will Zhang <wilzzha@gmail.com>
|
||||
# Guan Gui <ggui@student.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Lexical translation model that ignores word order.
|
||||
|
||||
In IBM Model 1, word order is ignored for simplicity. As long as the
|
||||
word alignments are equivalent, it doesn't matter where the word occurs
|
||||
in the source or target sentence. Thus, the following three alignments
|
||||
are equally likely::
|
||||
|
||||
Source: je mange du jambon
|
||||
Target: i eat some ham
|
||||
Alignment: (0,0) (1,1) (2,2) (3,3)
|
||||
|
||||
Source: je mange du jambon
|
||||
Target: some ham eat i
|
||||
Alignment: (0,2) (1,3) (2,1) (3,0)
|
||||
|
||||
Source: du jambon je mange
|
||||
Target: eat i some ham
|
||||
Alignment: (0,3) (1,2) (2,0) (3,1)
|
||||
|
||||
Note that an alignment is represented here as
|
||||
(word_index_in_target, word_index_in_source).
|
||||
|
||||
The EM algorithm used in Model 1 is:
|
||||
|
||||
:E step: In the training data, count how many times a source language
|
||||
word is translated into a target language word, weighted by
|
||||
the prior probability of the translation.
|
||||
|
||||
:M step: Estimate the new probability of translation based on the
|
||||
counts from the Expectation step.
|
||||
|
||||
Notations
|
||||
---------
|
||||
|
||||
:i: Position in the source sentence
|
||||
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
|
||||
:j: Position in the target sentence
|
||||
Valid values are 1, 2, ..., length of target sentence
|
||||
:s: A word in the source language
|
||||
:t: A word in the target language
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.translate import AlignedSent, Alignment, IBMModel
|
||||
from nltk.translate.ibm_model import Counts
|
||||
|
||||
|
||||
class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(round(ibm1.translation_table['buch']['book'], 3))
    0.889
    >>> print(round(ibm1.translation_table['das']['book'], 3))
    0.062
    >>> print(round(ibm1.translation_table['buch'][None], 3))
    0.113
    >>> print(round(ibm1.translation_table['ja'][None], 3))
    0.073

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Build a lexical translation model by training on
        ``sentence_aligned_corpus``.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)

        if probability_tables is not None:
            # Start from the caller-supplied probabilities.
            self.translation_table = probability_tables["translation_table"]
        else:
            self.set_uniform_probabilities(sentence_aligned_corpus)

        for _ in range(iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Initialize every target word's translation probabilities to
        ``1 / len(self.trg_vocab)``, warning when that value falls below
        ``IBMModel.MIN_PROB``.
        """
        vocab_size = len(self.trg_vocab)
        uniform_prob = 1 / vocab_size
        if uniform_prob < IBMModel.MIN_PROB:
            warnings.warn(
                f"Target language vocabulary is too large ({vocab_size} words). "
                "Results may be less accurate."
            )

        # Each target word gets its own table that defaults to the uniform
        # value for any source word looked up in it.
        for trg_word in self.trg_vocab:
            self.translation_table[trg_word] = defaultdict(lambda: uniform_prob)

    def train(self, parallel_corpus):
        """
        Run one EM iteration over ``parallel_corpus`` and update the
        lexical translation probabilities.
        """
        counts = Counts()
        for sentence_pair in parallel_corpus:
            target_words = sentence_pair.words
            # Position 0 of the source side is the NULL token.
            source_words = [None] + sentence_pair.mots

            # E step (a): Compute normalization factors to weigh counts
            normalizers = self.prob_all_alignments(source_words, target_words)

            # E step (b): Collect counts
            for t in target_words:
                normalizer = normalizers[t]
                for s in source_words:
                    weighted = self.prob_alignment_point(s, t) / normalizer
                    counts.t_given_s[t][s] += weighted
                    counts.any_t_given_s[s] += weighted

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        marginals = defaultdict(float)
        for trg_word in trg_sentence:
            for src_word in src_sentence:
                point_prob = self.prob_alignment_point(src_word, trg_word)
                marginals[trg_word] += point_prob
        return marginals

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        probabilities_for_t = self.translation_table[t]
        return probabilities_for_t[s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        likelihood = 1.0

        indexed_alignment = enumerate(alignment_info.alignment)
        next(indexed_alignment, None)  # skip the dummy zeroeth element
        for j, i in indexed_alignment:
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            likelihood *= self.translation_table[trg_word][src_word]

        return max(likelihood, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        """Set the best alignment on every sentence pair of the corpus."""
        for pair in parallel_corpus:
            self.align(pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        alignment_points = []

        for j, trg_word in enumerate(sentence_pair.words):
            probabilities = self.translation_table[trg_word]

            # Start from an alignment with the NULL token (recorded as None).
            top_prob = max(probabilities[None], IBMModel.MIN_PROB)
            top_index = None
            for i, src_word in enumerate(sentence_pair.mots):
                candidate_prob = probabilities[src_word]
                if candidate_prob >= top_prob:  # prefer newer word in case of tie
                    top_prob = candidate_prob
                    top_index = i

            alignment_points.append((j, top_index))

        sentence_pair.alignment = Alignment(alignment_points)
|
||||
319
backend/venv/Lib/site-packages/nltk/translate/ibm2.py
Normal file
319
backend/venv/Lib/site-packages/nltk/translate/ibm2.py
Normal file
@@ -0,0 +1,319 @@
|
||||
# Natural Language Toolkit: IBM Model 2
|
||||
#
|
||||
# Copyright (C) 2001-2013 NLTK Project
|
||||
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Lexical translation model that considers word order.
|
||||
|
||||
IBM Model 2 improves on Model 1 by accounting for word order.
|
||||
An alignment probability is introduced, a(i | j,l,m), which predicts
|
||||
a source word position, given its aligned target word's position.
|
||||
|
||||
The EM algorithm used in Model 2 is:
|
||||
|
||||
:E step: In the training data, collect counts, weighted by prior
|
||||
probabilities.
|
||||
|
||||
- (a) count how many times a source language word is translated
|
||||
into a target language word
|
||||
- (b) count how many times a particular position in the source
|
||||
sentence is aligned to a particular position in the target
|
||||
sentence
|
||||
|
||||
:M step: Estimate new probabilities based on the counts from the E step
|
||||
|
||||
Notations
|
||||
---------
|
||||
|
||||
:i: Position in the source sentence
|
||||
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
|
||||
:j: Position in the target sentence
|
||||
Valid values are 1, 2, ..., length of target sentence
|
||||
:l: Number of words in the source sentence, excluding NULL
|
||||
:m: Number of words in the target sentence
|
||||
:s: A word in the source language
|
||||
:t: A word in the target language
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1
|
||||
from nltk.translate.ibm_model import Counts
|
||||
|
||||
|
||||
class IBMModel2(IBMModel):
|
||||
"""
|
||||
Lexical translation model that considers word order
|
||||
|
||||
>>> bitext = []
|
||||
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
|
||||
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
|
||||
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
|
||||
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
|
||||
|
||||
>>> ibm2 = IBMModel2(bitext, 5)
|
||||
|
||||
>>> print(round(ibm2.translation_table['buch']['book'], 3))
|
||||
1.0
|
||||
>>> print(round(ibm2.translation_table['das']['book'], 3))
|
||||
0.0
|
||||
>>> print(round(ibm2.translation_table['buch'][None], 3))
|
||||
0.0
|
||||
>>> print(round(ibm2.translation_table['ja'][None], 3))
|
||||
0.0
|
||||
|
||||
>>> print(round(ibm2.alignment_table[1][1][2][2], 3))
|
||||
0.939
|
||||
>>> print(round(ibm2.alignment_table[1][2][2][2], 3))
|
||||
0.0
|
||||
>>> print(round(ibm2.alignment_table[2][2][4][5], 3))
|
||||
1.0
|
||||
|
||||
>>> test_sentence = bitext[2]
|
||||
>>> test_sentence.words
|
||||
['das', 'buch', 'ist', 'ja', 'klein']
|
||||
>>> test_sentence.mots
|
||||
['the', 'book', 'is', 'small']
|
||||
>>> test_sentence.alignment
|
||||
Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
|
||||
"""
|
||||
Train on ``sentence_aligned_corpus`` and create a lexical
|
||||
translation model and an alignment model.
|
||||
|
||||
Translation direction is from ``AlignedSent.mots`` to
|
||||
``AlignedSent.words``.
|
||||
|
||||
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
|
||||
:type sentence_aligned_corpus: list(AlignedSent)
|
||||
|
||||
:param iterations: Number of iterations to run training algorithm
|
||||
:type iterations: int
|
||||
|
||||
:param probability_tables: Optional. Use this to pass in custom
|
||||
probability values. If not specified, probabilities will be
|
||||
set to a uniform distribution, or some other sensible value.
|
||||
If specified, all the following entries must be present:
|
||||
``translation_table``, ``alignment_table``.
|
||||
See ``IBMModel`` for the type and purpose of these tables.
|
||||
:type probability_tables: dict[str]: object
|
||||
"""
|
||||
super().__init__(sentence_aligned_corpus)
|
||||
|
||||
if probability_tables is None:
|
||||
# Get translation probabilities from IBM Model 1
|
||||
# Run more iterations of training for Model 1, since it is
|
||||
# faster than Model 2
|
||||
ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
|
||||
self.translation_table = ibm1.translation_table
|
||||
self.set_uniform_probabilities(sentence_aligned_corpus)
|
||||
else:
|
||||
# Set user-defined probabilities
|
||||
self.translation_table = probability_tables["translation_table"]
|
||||
self.alignment_table = probability_tables["alignment_table"]
|
||||
|
||||
for n in range(0, iterations):
|
||||
self.train(sentence_aligned_corpus)
|
||||
|
||||
self.align_all(sentence_aligned_corpus)
|
||||
|
||||
def set_uniform_probabilities(self, sentence_aligned_corpus):
|
||||
# a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
|
||||
l_m_combinations = set()
|
||||
for aligned_sentence in sentence_aligned_corpus:
|
||||
l = len(aligned_sentence.mots)
|
||||
m = len(aligned_sentence.words)
|
||||
if (l, m) not in l_m_combinations:
|
||||
l_m_combinations.add((l, m))
|
||||
initial_prob = 1 / (l + 1)
|
||||
if initial_prob < IBMModel.MIN_PROB:
|
||||
warnings.warn(
|
||||
"A source sentence is too long ("
|
||||
+ str(l)
|
||||
+ " words). Results may be less accurate."
|
||||
)
|
||||
|
||||
for i in range(0, l + 1):
|
||||
for j in range(1, m + 1):
|
||||
self.alignment_table[i][j][l][m] = initial_prob
|
||||
|
||||
def train(self, parallel_corpus):
|
||||
counts = Model2Counts()
|
||||
for aligned_sentence in parallel_corpus:
|
||||
src_sentence = [None] + aligned_sentence.mots
|
||||
trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed
|
||||
l = len(aligned_sentence.mots)
|
||||
m = len(aligned_sentence.words)
|
||||
|
||||
# E step (a): Compute normalization factors to weigh counts
|
||||
total_count = self.prob_all_alignments(src_sentence, trg_sentence)
|
||||
|
||||
# E step (b): Collect counts
|
||||
for j in range(1, m + 1):
|
||||
t = trg_sentence[j]
|
||||
for i in range(0, l + 1):
|
||||
s = src_sentence[i]
|
||||
count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
|
||||
normalized_count = count / total_count[t]
|
||||
|
||||
counts.update_lexical_translation(normalized_count, s, t)
|
||||
counts.update_alignment(normalized_count, i, j, l, m)
|
||||
|
||||
# M step: Update probabilities with maximum likelihood estimates
|
||||
self.maximize_lexical_translation_probabilities(counts)
|
||||
self.maximize_alignment_probabilities(counts)
|
||||
|
||||
def maximize_alignment_probabilities(self, counts):
|
||||
MIN_PROB = IBMModel.MIN_PROB
|
||||
for i, j_s in counts.alignment.items():
|
||||
for j, src_sentence_lengths in j_s.items():
|
||||
for l, trg_sentence_lengths in src_sentence_lengths.items():
|
||||
for m in trg_sentence_lengths:
|
||||
estimate = (
|
||||
counts.alignment[i][j][l][m]
|
||||
/ counts.alignment_for_any_i[j][l][m]
|
||||
)
|
||||
self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)
|
||||
|
||||
def prob_all_alignments(self, src_sentence, trg_sentence):
    """
    Sum the alignment-point probabilities over every source position,
    grouped by target word.

    For each target position j >= 1 (index 0 is skipped) the values of
    ``prob_alignment_point(i, j, ...)`` for every source index i are
    added to the entry keyed by the target word at j. A word occurring
    at several target positions therefore accumulates all of them.

    Summing the entries of the returned mapping gives the total over
    all alignment points of the sentence pair.

    :return: Accumulated probability for each target word
    :rtype: dict(str): float
    """
    marginals = defaultdict(float)
    source_positions = range(len(src_sentence))
    for trg_pos in range(1, len(trg_sentence)):
        word = trg_sentence[trg_pos]
        for src_pos in source_positions:
            marginals[word] += self.prob_alignment_point(
                src_pos, trg_pos, src_sentence, trg_sentence
            )
    return marginals
|
||||
|
||||
def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
    """
    Probability that position j in ``trg_sentence`` is aligned to
    position i in ``src_sentence``.

    Computed as the lexical translation probability of the two words
    multiplied by the alignment probability for (i, j) given the two
    sentence lengths. Both lengths are taken as ``len(...) - 1``, i.e.
    the element at index 0 of each sentence is not counted.
    """
    src_len = len(src_sentence) - 1
    trg_len = len(trg_sentence) - 1
    src_word = src_sentence[i]
    trg_word = trg_sentence[j]
    lexical = self.translation_table[trg_word][src_word]
    positional = self.alignment_table[i][j][src_len][trg_len]
    return lexical * positional
|
||||
|
||||
def prob_t_a_given_s(self, alignment_info):
    """
    Probability of the target sentence and an alignment, given the
    source sentence.

    The result is the product, over every target position j >= 1, of
    the lexical translation probability and the alignment probability
    of the link recorded in ``alignment_info.alignment``. It is never
    smaller than ``IBMModel.MIN_PROB``.
    """
    source = alignment_info.src_sentence
    target = alignment_info.trg_sentence
    src_len = len(source) - 1
    trg_len = len(target) - 1

    product = 1.0
    for j, i in enumerate(alignment_info.alignment):
        if j > 0:  # element 0 is a dummy entry
            trg_word = target[j]
            src_word = source[i]
            lexical = self.translation_table[trg_word][src_word]
            product *= lexical * self.alignment_table[i][j][src_len][trg_len]

    return max(product, IBMModel.MIN_PROB)
|
||||
|
||||
def align_all(self, parallel_corpus):
    """
    Call ``align`` on every sentence pair of ``parallel_corpus``, in
    iteration order.
    """
    align_one = self.align
    for pair in parallel_corpus:
        align_one(pair)
|
||||
|
||||
def align(self, sentence_pair):
    """
    Determine the best word alignment for one sentence pair from the
    corpus that the model was trained on.

    For every target word, the NULL link (clamped from below to
    ``IBMModel.MIN_PROB``) is the starting candidate; a source word
    replaces it when its score is greater than or equal to the best
    score so far, so on ties the later source word wins.

    The result is stored in ``sentence_pair.alignment``. Unlike the
    internal tables, the indices in the ``Alignment`` are zero-indexed,
    and a target word linked to NULL is paired with ``None``.

    :param sentence_pair: A sentence in the source language and its
        counterpart sentence in the target language
    :type sentence_pair: AlignedSent
    """
    source_words = sentence_pair.mots
    target_words = sentence_pair.words
    src_len = len(source_words)
    trg_len = len(target_words)

    links = []
    # j and i below are the 1-indexed positions used by the tables
    for j, trg_word in enumerate(target_words, start=1):
        null_score = (
            self.translation_table[trg_word][None]
            * self.alignment_table[0][j][src_len][trg_len]
        )
        top_score = max(null_score, IBMModel.MIN_PROB)
        top_source = None
        for i, src_word in enumerate(source_words, start=1):
            score = (
                self.translation_table[trg_word][src_word]
                * self.alignment_table[i][j][src_len][trg_len]
            )
            if score >= top_score:
                top_score = score
                top_source = i - 1

        links.append((j - 1, top_source))

    sentence_pair.alignment = Alignment(links)
|
||||
|
||||
|
||||
class Model2Counts(Counts):
    """
    Container for the counts accumulated during one training pass,
    extended with alignment counts.
    """

    def __init__(self):
        super().__init__()

        def by_m():
            return defaultdict(float)

        def by_l():
            return defaultdict(by_m)

        def by_j():
            return defaultdict(by_l)

        # alignment[i][j][l][m] -> accumulated count
        self.alignment = defaultdict(by_j)
        # alignment_for_any_i[j][l][m] -> the same count summed over i
        self.alignment_for_any_i = defaultdict(by_l)

    def update_lexical_translation(self, count, s, t):
        """Add ``count`` to the (t, s) pair and to the total for ``s``."""
        self.t_given_s[t][s] += count
        self.any_t_given_s[s] += count

    def update_alignment(self, count, i, j, l, m):
        """Add ``count`` to the (i, j, l, m) entry and to its total over i."""
        self.alignment[i][j][l][m] += count
        self.alignment_for_any_i[j][l][m] += count
|
||||
346
backend/venv/Lib/site-packages/nltk/translate/ibm3.py
Normal file
346
backend/venv/Lib/site-packages/nltk/translate/ibm3.py
Normal file
@@ -0,0 +1,346 @@
|
||||
# Natural Language Toolkit: IBM Model 3
|
||||
#
|
||||
# Copyright (C) 2001-2013 NLTK Project
|
||||
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Translation model that considers how a word can be aligned to
|
||||
multiple words in another language.
|
||||
|
||||
IBM Model 3 improves on Model 2 by directly modeling the phenomenon
|
||||
where a word in one language may be translated into zero or more words
|
||||
in another. This is expressed by the fertility probability,
|
||||
n(phi | source word).
|
||||
|
||||
If a source word translates into more than one word, it is possible to
|
||||
generate sentences that have the same alignment in multiple ways. This
|
||||
is modeled by a distortion step. The distortion probability, d(j|i,l,m),
|
||||
predicts a target word position, given its aligned source word's
|
||||
position. The distortion probability replaces the alignment probability
|
||||
of Model 2.
|
||||
|
||||
The fertility probability is not applicable for NULL. Target words that
|
||||
align to NULL are assumed to be distributed uniformly in the target
|
||||
sentence. The existence of these words is modeled by p1, the probability
|
||||
that a target word produced by a real source word requires another
|
||||
target word that is produced by NULL.
|
||||
|
||||
The EM algorithm used in Model 3 is:
|
||||
|
||||
:E step: In the training data, collect counts, weighted by prior
|
||||
probabilities.
|
||||
|
||||
- (a) count how many times a source language word is translated
|
||||
into a target language word
|
||||
- (b) count how many times a particular position in the target
|
||||
sentence is aligned to a particular position in the source
|
||||
sentence
|
||||
- (c) count how many times a source word is aligned to phi number
|
||||
of target words
|
||||
- (d) count how many times NULL is aligned to a target word
|
||||
|
||||
:M step: Estimate new probabilities based on the counts from the E step
|
||||
|
||||
Because there are too many possible alignments, only the most probable
|
||||
ones are considered. First, the best alignment is determined using prior
|
||||
probabilities. Then, a hill climbing approach is used to find other good
|
||||
candidates.
|
||||
|
||||
Notations
|
||||
---------
|
||||
|
||||
:i: Position in the source sentence
|
||||
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
|
||||
:j: Position in the target sentence
|
||||
Valid values are 1, 2, ..., length of target sentence
|
||||
:l: Number of words in the source sentence, excluding NULL
|
||||
:m: Number of words in the target sentence
|
||||
:s: A word in the source language
|
||||
:t: A word in the target language
|
||||
:phi: Fertility, the number of target words produced by a source word
|
||||
:p1: Probability that a target word produced by a source word is
|
||||
accompanied by another target word that is aligned to NULL
|
||||
:p0: 1 - p1
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from math import factorial
|
||||
|
||||
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2
|
||||
from nltk.translate.ibm_model import Counts
|
||||
|
||||
|
||||
class IBMModel3(IBMModel):
    """
    Translation model that considers how a word can be aligned to
    multiple words in another language

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))

    >>> ibm3 = IBMModel3(bitext, 5)

    >>> print(round(ibm3.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm3.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm3.translation_table['ja'][None], 3))
    1.0

    >>> print(round(ibm3.distortion_table[1][1][2][2], 3))
    1.0
    >>> print(round(ibm3.distortion_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm3.distortion_table[2][2][4][5], 3))
    0.75

    >>> print(round(ibm3.fertility_table[2]['summarize'], 3))
    1.0
    >>> print(round(ibm3.fertility_table[1]['book'], 3))
    1.0

    >>> print(round(ibm3.p1, 3))
    0.054

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model, a distortion model, a fertility model, and a
        model for generating NULL-aligned words.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``,
            ``fertility_table``, ``p1``, ``distortion_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)
        self.reset_probabilities()

        if probability_tables is None:
            # Get translation and alignment probabilities from IBM Model 2
            ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
            self.translation_table = ibm2.translation_table
            self.alignment_table = ibm2.alignment_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
            self.alignment_table = probability_tables["alignment_table"]
            self.fertility_table = probability_tables["fertility_table"]
            self.p1 = probability_tables["p1"]
            self.distortion_table = probability_tables["distortion_table"]

        for _ in range(iterations):
            self.train(sentence_aligned_corpus)

    def reset_probabilities(self):
        """
        Reset the tables of the base class and install an empty
        distortion table whose missing entries read as ``MIN_PROB``.
        """
        super().reset_probabilities()
        # dict[int][int][int][int]: float. Probability(j | i,l,m).
        # Values accessed as ``distortion_table[j][i][l][m]``.
        self.distortion_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
            )
        )

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Initialise the distortion table uniformly and set starting
        values for the fertility table and ``p1``.

        A warning is issued when ``1 / m`` falls below
        ``IBMModel.MIN_PROB``. Sentence pairs whose target side is
        empty contribute no distortion entries.

        :param sentence_aligned_corpus: Corpus whose (l, m) length
            combinations determine which entries are initialised
        """
        # d(j | i,l,m) = 1 / m for all i, j, l, m
        l_m_combinations = set()
        for aligned_sentence in sentence_aligned_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            if (l, m) in l_m_combinations:
                continue
            l_m_combinations.add((l, m))

            if m == 0:
                # An empty target sentence has no position j to
                # initialise, and 1 / m would raise ZeroDivisionError.
                continue

            initial_prob = 1 / m
            if initial_prob < IBMModel.MIN_PROB:
                warnings.warn(
                    "A target sentence is too long ("
                    + str(m)
                    + " words). Results may be less accurate."
                )
            for j in range(1, m + 1):
                for i in range(0, l + 1):
                    self.distortion_table[j][i][l][m] = initial_prob

        # simple initialization, taken from GIZA++
        self.fertility_table[0] = defaultdict(lambda: 0.2)
        self.fertility_table[1] = defaultdict(lambda: 0.65)
        self.fertility_table[2] = defaultdict(lambda: 0.1)
        self.fertility_table[3] = defaultdict(lambda: 0.04)
        MAX_FERTILITY = 10
        # The remaining 0.01 is shared equally by fertilities 4..9
        initial_fert_prob = 0.01 / (MAX_FERTILITY - 4)
        for phi in range(4, MAX_FERTILITY):
            self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob)

        self.p1 = 0.5

    def train(self, parallel_corpus):
        """
        Run one EM iteration over ``parallel_corpus``.

        For every sentence pair the alignment space is sampled, the
        best sampled alignment is stored on the pair, and counts
        weighted by the normalized alignment probabilities are
        collected. All tables except ``alignment_table`` are then
        re-estimated from those counts.

        :param parallel_corpus: Sentence pairs to collect counts from
        """
        counts = Model3Counts()
        for aligned_sentence in parallel_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment()
            )

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(
                        normalized_count, alignment_info, j
                    )
                    counts.update_distortion(normalized_count, alignment_info, j, l, m)

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_distortion_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)

    def maximize_distortion_probabilities(self, counts):
        """
        Re-estimate ``distortion_table`` as
        ``counts.distortion[j][i][l][m]`` divided by
        ``counts.distortion_for_any_j[i][l][m]``, clamped from below
        to ``IBMModel.MIN_PROB``.

        :param counts: Counts gathered during the E step
        """
        MIN_PROB = IBMModel.MIN_PROB
        for j, i_s in counts.distortion.items():
            for i, src_sentence_lengths in i_s.items():
                for l, trg_sentence_lengths in src_sentence_lengths.items():
                    for m in trg_sentence_lengths:
                        estimate = (
                            counts.distortion[j][i][l][m]
                            / counts.distortion_for_any_j[i][l][m]
                        )
                        self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB)

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence

        The NULL insertion, fertility, lexical translation and
        distortion factors are multiplied together in that order. As
        soon as the running product drops below ``IBMModel.MIN_PROB``
        the method returns ``MIN_PROB`` without evaluating the
        remaining factors.
        """
        src_sentence = alignment_info.src_sentence
        trg_sentence = alignment_info.trg_sentence
        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1
        p1 = self.p1
        p0 = 1 - p1

        probability = 1.0
        MIN_PROB = IBMModel.MIN_PROB

        # Combine NULL insertion probability
        null_fertility = alignment_info.fertility_of_i(0)
        probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
        if probability < MIN_PROB:
            return MIN_PROB

        # Compute combination (m - null_fertility) choose null_fertility
        for i in range(1, null_fertility + 1):
            probability *= (m - null_fertility - i + 1) / i
            if probability < MIN_PROB:
                return MIN_PROB

        # Combine fertility probabilities
        for i in range(1, l + 1):
            fertility = alignment_info.fertility_of_i(i)
            probability *= (
                factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]
            )
            if probability < MIN_PROB:
                return MIN_PROB

        # Combine lexical and distortion probabilities
        for j in range(1, m + 1):
            t = trg_sentence[j]
            i = alignment_info.alignment[j]
            s = src_sentence[i]

            probability *= (
                self.translation_table[t][s] * self.distortion_table[j][i][l][m]
            )
            if probability < MIN_PROB:
                return MIN_PROB

        return probability
|
||||
|
||||
|
||||
class Model3Counts(Counts):
    """
    Container for the counts accumulated during one training pass,
    extended with distortion counts.
    """

    def __init__(self):
        super().__init__()

        def by_m():
            return defaultdict(float)

        def by_l():
            return defaultdict(by_m)

        def by_i():
            return defaultdict(by_l)

        # distortion[j][i][l][m] -> accumulated count
        self.distortion = defaultdict(by_i)
        # distortion_for_any_j[i][l][m] -> the same count summed over j
        self.distortion_for_any_j = defaultdict(by_l)

    def update_distortion(self, count, alignment_info, j, l, m):
        """
        Add ``count`` for target position ``j`` and the source position
        that ``alignment_info.alignment`` links it to.
        """
        src_pos = alignment_info.alignment[j]
        self.distortion[j][src_pos][l][m] += count
        self.distortion_for_any_j[src_pos][l][m] += count
|
||||
490
backend/venv/Lib/site-packages/nltk/translate/ibm4.py
Normal file
490
backend/venv/Lib/site-packages/nltk/translate/ibm4.py
Normal file
@@ -0,0 +1,490 @@
|
||||
# Natural Language Toolkit: IBM Model 4
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Translation model that reorders output words based on their type and
|
||||
distance from other related words in the output sentence.
|
||||
|
||||
IBM Model 4 improves the distortion model of Model 3, motivated by the
|
||||
observation that certain words tend to be re-ordered in a predictable
|
||||
way relative to one another. For example, <adjective><noun> in English
|
||||
usually has its order flipped as <noun><adjective> in French.
|
||||
|
||||
Model 4 requires words in the source and target vocabularies to be
|
||||
categorized into classes. This can be linguistically driven, like parts
|
||||
of speech (adjective, nouns, prepositions, etc). Word classes can also
|
||||
be obtained by statistical methods. The original IBM Model 4 uses an
|
||||
information theoretic approach to group words into 50 classes for each
|
||||
vocabulary.
|
||||
|
||||
Terminology
|
||||
-----------
|
||||
|
||||
:Cept:
|
||||
A source word with non-zero fertility i.e. aligned to one or more
|
||||
target words.
|
||||
:Tablet:
|
||||
The set of target word(s) aligned to a cept.
|
||||
:Head of cept:
|
||||
The first word of the tablet of that cept.
|
||||
:Center of cept:
|
||||
The average position of the words in that cept's tablet. If the
|
||||
value is not an integer, the ceiling is taken.
|
||||
For example, for a tablet with words in positions 2, 5, 6 in the
|
||||
target sentence, the center of the corresponding cept is
|
||||
ceil((2 + 5 + 6) / 3) = 5
|
||||
:Displacement:
|
||||
For a head word, defined as (position of head word - position of
|
||||
previous cept's center). Can be positive or negative.
|
||||
For a non-head word, defined as (position of non-head word -
|
||||
position of previous word in the same tablet). Always positive,
|
||||
because successive words in a tablet are assumed to appear to the
|
||||
right of the previous word.
|
||||
|
||||
In contrast to Model 3 which reorders words in a tablet independently of
|
||||
other words, Model 4 distinguishes between three cases.
|
||||
|
||||
1. Words generated by NULL are distributed uniformly.
|
||||
2. For a head word t, its position is modeled by the probability
|
||||
d_head(displacement | word_class_s(s),word_class_t(t)),
|
||||
where s is the previous cept, and word_class_s and word_class_t map
|
||||
s and t to a source and target language word class respectively.
|
||||
3. For a non-head word t, its position is modeled by the probability
|
||||
d_non_head(displacement | word_class_t(t))
|
||||
|
||||
The EM algorithm used in Model 4 is:
|
||||
|
||||
:E step: In the training data, collect counts, weighted by prior
|
||||
probabilities.
|
||||
|
||||
- (a) count how many times a source language word is translated
|
||||
into a target language word
|
||||
- (b) for a particular word class, count how many times a head
|
||||
word is located at a particular displacement from the
|
||||
previous cept's center
|
||||
- (c) for a particular word class, count how many times a
|
||||
non-head word is located at a particular displacement from
|
||||
the previous target word
|
||||
- (d) count how many times a source word is aligned to phi number
|
||||
of target words
|
||||
- (e) count how many times NULL is aligned to a target word
|
||||
|
||||
:M step: Estimate new probabilities based on the counts from the E step
|
||||
|
||||
Like Model 3, there are too many possible alignments to consider. Thus,
|
||||
a hill climbing approach is used to sample good candidates.
|
||||
|
||||
Notations
|
||||
---------
|
||||
|
||||
:i: Position in the source sentence
|
||||
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
|
||||
:j: Position in the target sentence
|
||||
Valid values are 1, 2, ..., length of target sentence
|
||||
:l: Number of words in the source sentence, excluding NULL
|
||||
:m: Number of words in the target sentence
|
||||
:s: A word in the source language
|
||||
:t: A word in the target language
|
||||
:phi: Fertility, the number of target words produced by a source word
|
||||
:p1: Probability that a target word produced by a source word is
|
||||
accompanied by another target word that is aligned to NULL
|
||||
:p0: 1 - p1
|
||||
:dj: Displacement, Δj
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from math import factorial
|
||||
|
||||
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel3
|
||||
from nltk.translate.ibm_model import Counts, longest_target_sentence_length
|
||||
|
||||
|
||||
class IBMModel4(IBMModel):
|
||||
"""
|
||||
Translation model that reorders output words based on their type and
|
||||
their distance from other related words in the output sentence
|
||||
|
||||
>>> bitext = []
|
||||
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
|
||||
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
|
||||
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
|
||||
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
|
||||
>>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
|
||||
>>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
|
||||
>>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
|
||||
>>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
|
||||
|
||||
>>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)
|
||||
|
||||
>>> print(round(ibm4.translation_table['buch']['book'], 3))
|
||||
1.0
|
||||
>>> print(round(ibm4.translation_table['das']['book'], 3))
|
||||
0.0
|
||||
>>> print(round(ibm4.translation_table['ja'][None], 3))
|
||||
1.0
|
||||
|
||||
>>> print(round(ibm4.head_distortion_table[1][0][1], 3))
|
||||
1.0
|
||||
>>> print(round(ibm4.head_distortion_table[2][0][1], 3))
|
||||
0.0
|
||||
>>> print(round(ibm4.non_head_distortion_table[3][6], 3))
|
||||
0.5
|
||||
|
||||
>>> print(round(ibm4.fertility_table[2]['summarize'], 3))
|
||||
1.0
|
||||
>>> print(round(ibm4.fertility_table[1]['book'], 3))
|
||||
1.0
|
||||
|
||||
>>> print(round(ibm4.p1, 3))
|
||||
0.033
|
||||
|
||||
>>> test_sentence = bitext[2]
|
||||
>>> test_sentence.words
|
||||
['das', 'buch', 'ist', 'ja', 'klein']
|
||||
>>> test_sentence.mots
|
||||
['the', 'book', 'is', 'small']
|
||||
>>> test_sentence.alignment
|
||||
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    sentence_aligned_corpus,
    iterations,
    source_word_classes,
    target_word_classes,
    probability_tables=None,
):
    """
    Train on ``sentence_aligned_corpus`` and build a lexical
    translation model, head and non-head distortion models, a
    fertility model, and a model for generating NULL-aligned words.

    Translation direction is from ``AlignedSent.mots`` to
    ``AlignedSent.words``.

    :param sentence_aligned_corpus: Sentence-aligned parallel corpus
    :type sentence_aligned_corpus: list(AlignedSent)

    :param iterations: Number of iterations to run training algorithm
    :type iterations: int

    :param source_word_classes: Lookup table that maps a source word
        to its word class, the latter represented by an integer id
    :type source_word_classes: dict[str]: int

    :param target_word_classes: Lookup table that maps a target word
        to its word class, the latter represented by an integer id
    :type target_word_classes: dict[str]: int

    :param probability_tables: Optional. Custom starting values. When
        omitted, an ``IBMModel3`` is trained for ``iterations``
        iterations to supply the translation, alignment and fertility
        tables and ``p1``, and the distortion tables are initialised
        uniformly. When given, all of the following entries must be
        present: ``translation_table``, ``alignment_table``,
        ``fertility_table``, ``p1``, ``head_distortion_table``,
        ``non_head_distortion_table``. See ``IBMModel`` and
        ``IBMModel4`` for the type and purpose of these tables.
    :type probability_tables: dict[str]: object
    """
    super().__init__(sentence_aligned_corpus)
    self.reset_probabilities()
    self.src_classes = source_word_classes
    self.trg_classes = target_word_classes

    if probability_tables is not None:
        # Start from the caller's tables
        tables = probability_tables
        self.translation_table = tables["translation_table"]
        self.alignment_table = tables["alignment_table"]
        self.fertility_table = tables["fertility_table"]
        self.p1 = tables["p1"]
        self.head_distortion_table = tables["head_distortion_table"]
        self.non_head_distortion_table = tables["non_head_distortion_table"]
    else:
        # Bootstrap from a freshly trained IBM Model 3
        bootstrap = IBMModel3(sentence_aligned_corpus, iterations)
        self.translation_table = bootstrap.translation_table
        self.alignment_table = bootstrap.alignment_table
        self.fertility_table = bootstrap.fertility_table
        self.p1 = bootstrap.p1
        self.set_uniform_probabilities(sentence_aligned_corpus)

    for _ in range(iterations):
        self.train(sentence_aligned_corpus)
|
||||
|
||||
def reset_probabilities(self):
    """
    Reset the tables of the base class and install empty head and
    non-head distortion tables whose missing entries read as
    ``MIN_PROB``.
    """
    super().reset_probabilities()

    def floor():
        return self.MIN_PROB

    def by_trg_class():
        return defaultdict(floor)

    def by_src_class():
        return defaultdict(by_trg_class)

    # dict[int][int][int]: float. Probability(displacement of head
    # word | word class of previous cept, target word class).
    # Values accessed as ``head_distortion_table[dj][src_class][trg_class]``.
    self.head_distortion_table = defaultdict(by_src_class)

    # dict[int][int]: float. Probability(displacement of non-head
    # word | target word class).
    # Values accessed as ``non_head_distortion_table[dj][trg_class]``.
    self.non_head_distortion_table = defaultdict(by_trg_class)
|
||||
|
||||
def set_uniform_probabilities(self, sentence_aligned_corpus):
    """
    Initialise both distortion tables uniformly to
    1 / (number of possible displacement values).

    With ``max_m`` the length of the longest target sentence, the
    displacement can be any non-zero integer from -(max_m - 1) to
    max_m - 1, which gives 2 * (max_m - 1) values. When ``max_m`` is
    at most 1 there is no such value and ``IBMModel.MIN_PROB`` is used
    as the starting probability instead.
    """
    max_m = longest_target_sentence_length(sentence_aligned_corpus)

    initial_prob = (
        IBMModel.MIN_PROB if max_m <= 1 else 1 / (2 * (max_m - 1))
    )
    if initial_prob < IBMModel.MIN_PROB:
        warnings.warn(
            f"A target sentence is too long ({max_m} words)."
            " Results may be less accurate."
        )

    def uniform_by_trg_class():
        return defaultdict(lambda: initial_prob)

    for magnitude in range(1, max_m):
        # Positive displacement first, then its negative counterpart
        for dj in (magnitude, -magnitude):
            self.head_distortion_table[dj] = defaultdict(uniform_by_trg_class)
            self.non_head_distortion_table[dj] = uniform_by_trg_class()
|
||||
|
||||
def train(self, parallel_corpus):
    """
    Run one EM iteration over ``parallel_corpus``.

    For every sentence pair the alignment space is sampled, the best
    sampled alignment is stored on the pair, and counts weighted by
    the normalized alignment probabilities are collected. All tables
    except ``alignment_table`` are then re-estimated from the counts.

    :param parallel_corpus: Sentence pairs to collect counts from
    """
    tallies = Model4Counts()
    for pair in parallel_corpus:
        trg_len = len(pair.words)

        # Sample the alignment space and keep the best candidate
        samples, best = self.sample(pair)
        pair.alignment = Alignment(best.zero_indexed_alignment())

        # E step (a): normalization factor for this sentence pair
        normalizer = self.prob_of_alignments(samples)

        # E step (b): accumulate the normalized counts
        for info in samples:
            weight = self.prob_t_a_given_s(info) / normalizer

            for j in range(1, trg_len + 1):
                tallies.update_lexical_translation(weight, info, j)
                tallies.update_distortion(
                    weight, info, j, self.src_classes, self.trg_classes
                )

            tallies.update_null_generation(weight, info)
            tallies.update_fertility(weight, info)

    # M step: maximum likelihood re-estimation. The alignment table is
    # carried across the reset because it is not retrained here.
    kept_alignment_table = self.alignment_table
    self.reset_probabilities()
    self.alignment_table = kept_alignment_table

    self.maximize_lexical_translation_probabilities(tallies)
    self.maximize_distortion_probabilities(tallies)
    self.maximize_fertility_probabilities(tallies)
    self.maximize_null_generation_probabilities(tallies)
|
||||
|
||||
def maximize_distortion_probabilities(self, counts):
    """
    Re-estimate both distortion tables from ``counts``.

    Each estimate is the count for a displacement divided by the count
    over all displacements for the same word class(es), and is clamped
    from below to ``IBMModel.MIN_PROB``.

    :param counts: Object exposing ``head_distortion``,
        ``head_distortion_for_any_dj``, ``non_head_distortion`` and
        ``non_head_distortion_for_any_dj``
    """
    floor = IBMModel.MIN_PROB

    # Head words: indexed by displacement, source class, target class
    head_table = self.head_distortion_table
    head_totals = counts.head_distortion_for_any_dj
    for dj, by_src_class in counts.head_distortion.items():
        for s_cls, by_trg_class in by_src_class.items():
            for t_cls, joint in by_trg_class.items():
                ratio = joint / head_totals[s_cls][t_cls]
                head_table[dj][s_cls][t_cls] = max(ratio, floor)

    # Non-head words: indexed by displacement and target class
    non_head_table = self.non_head_distortion_table
    non_head_totals = counts.non_head_distortion_for_any_dj
    for dj, by_trg_class in counts.non_head_distortion.items():
        for t_cls, joint in by_trg_class.items():
            ratio = joint / non_head_totals[t_cls]
            non_head_table[dj][t_cls] = max(ratio, floor)
|
||||
|
||||
def prob_t_a_given_s(self, alignment_info):
    """
    Probability of target sentence and an alignment given the
    source sentence

    Delegates to the static ``model4_prob_t_a_given_s``, passing this
    model as the source of the probability tables.
    """
    score = IBMModel4.model4_prob_t_a_given_s
    return score(alignment_info, self)
|
||||
|
||||
@staticmethod  # exposed for Model 5 to use
def model4_prob_t_a_given_s(alignment_info, ibm_model):
    """
    Score ``alignment_info`` using the tables held by ``ibm_model``.

    The score is the product of a NULL-generation term, a fertility
    term, and a lexical translation term and a distortion term for
    each target position. As soon as the running product drops below
    ``IBMModel.MIN_PROB`` the computation stops and ``MIN_PROB`` is
    returned.

    :param alignment_info: Alignment to score
    :param ibm_model: Object providing ``p1``, ``fertility_table``,
        ``translation_table``, ``src_classes``, ``trg_classes``,
        ``head_distortion_table`` and ``non_head_distortion_table``
    """
    MIN_PROB = IBMModel.MIN_PROB
    trg_sentence = alignment_info.trg_sentence
    src_sentence = alignment_info.src_sentence
    m = len(trg_sentence) - 1  # index 0 of the target sentence is not a word

    def null_generation_term():
        # Binomial distribution: B(m - null_fertility, p1)
        p1 = ibm_model.p1
        null_fertility = alignment_info.fertility_of_i(0)
        value = 1.0
        value *= pow(p1, null_fertility) * pow(1 - p1, m - 2 * null_fertility)
        if value < MIN_PROB:
            return MIN_PROB

        # Combination: (m - null_fertility) choose null_fertility
        for i in range(1, null_fertility + 1):
            value *= (m - null_fertility - i + 1) / i
        return value

    def fertility_term():
        value = 1.0
        for i in range(1, len(src_sentence)):
            phi = alignment_info.fertility_of_i(i)
            value *= factorial(phi) * ibm_model.fertility_table[phi][src_sentence[i]]
            if value < MIN_PROB:
                return MIN_PROB
        return value

    def lexical_translation_term(j):
        t = trg_sentence[j]
        s = src_sentence[alignment_info.alignment[j]]
        return ibm_model.translation_table[t][s]

    def distortion_term(j):
        t = trg_sentence[j]

        # case 1: t is aligned to NULL
        if alignment_info.alignment[j] == 0:
            return 1.0

        # case 3: t is a subsequent word of a tablet
        if not alignment_info.is_head_word(j):
            previous_position = alignment_info.previous_in_tablet(j)
            trg_class = ibm_model.trg_classes[t]
            dj = j - previous_position
            return ibm_model.non_head_distortion_table[dj][trg_class]

        # case 2: t is the first word of a tablet
        previous_cept = alignment_info.previous_cept(j)
        if previous_cept is None:
            src_class = None
        else:
            src_class = ibm_model.src_classes[src_sentence[previous_cept]]
        trg_class = ibm_model.trg_classes[t]
        dj = j - alignment_info.center_of_cept(previous_cept)
        return ibm_model.head_distortion_table[dj][src_class][trg_class]

    # end nested functions

    # Abort computation whenever probability falls below MIN_PROB at
    # any point, since MIN_PROB can be considered as zero
    probability = 1.0
    for sentence_level_term in (null_generation_term, fertility_term):
        probability *= sentence_level_term()
        if probability < MIN_PROB:
            return MIN_PROB

    for j in range(1, m + 1):
        for position_term in (lexical_translation_term, distortion_term):
            probability *= position_term(j)
            if probability < MIN_PROB:
                return MIN_PROB

    return probability
|
||||
|
||||
|
||||
class Model4Counts(Counts):
    """
    Counts accumulated during one training iteration, extended with
    distortion counts for head and non-head words.
    """

    def __init__(self):
        super().__init__()
        # head_distortion[dj][src_class][trg_class]
        self.head_distortion = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        # head_distortion_for_any_dj[src_class][trg_class]
        self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(float))
        # non_head_distortion[dj][trg_class]
        self.non_head_distortion = defaultdict(lambda: defaultdict(float))
        # non_head_distortion_for_any_dj[trg_class]
        self.non_head_distortion_for_any_dj = defaultdict(float)

    def update_distortion(self, count, alignment_info, j, src_classes, trg_classes):
        """
        Add ``count`` to the distortion counts for target position ``j``.

        :param count: Value to add
        :param alignment_info: Alignment under consideration
        :param j: Target word position under consideration
        :param src_classes: Lookup from source word to word class
        :param trg_classes: Lookup from target word to word class
        """
        aligned_i = alignment_info.alignment[j]
        target_word = alignment_info.trg_sentence[j]

        # case 1: t is aligned to NULL -- nothing to count
        if aligned_i == 0:
            return

        # case 3: t is a subsequent word of a tablet
        if not alignment_info.is_head_word(j):
            previous_j = alignment_info.previous_in_tablet(j)
            trg_class = trg_classes[target_word]
            dj = j - previous_j
            self.non_head_distortion[dj][trg_class] += count
            self.non_head_distortion_for_any_dj[trg_class] += count
            return

        # case 2: t is the first word of a tablet
        previous_cept = alignment_info.previous_cept(j)
        src_class = None
        if previous_cept is not None:
            src_class = src_classes[alignment_info.src_sentence[previous_cept]]
        trg_class = trg_classes[target_word]
        dj = j - alignment_info.center_of_cept(previous_cept)
        self.head_distortion[dj][src_class][trg_class] += count
        self.head_distortion_for_any_dj[src_class][trg_class] += count
|
||||
661
backend/venv/Lib/site-packages/nltk/translate/ibm5.py
Normal file
661
backend/venv/Lib/site-packages/nltk/translate/ibm5.py
Normal file
@@ -0,0 +1,661 @@
|
||||
# Natural Language Toolkit: IBM Model 5
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Translation model that keeps track of vacant positions in the target
|
||||
sentence to decide where to place translated words.
|
||||
|
||||
Translation can be viewed as a process where each word in the source
|
||||
sentence is stepped through sequentially, generating translated words
|
||||
for each source word. The target sentence can be viewed as being made
|
||||
up of ``m`` empty slots initially, which gradually fill up as generated
|
||||
words are placed in them.
|
||||
|
||||
Models 3 and 4 use distortion probabilities to decide how to place
|
||||
translated words. For simplicity, these models ignore the history of
|
||||
which slots have already been occupied with translated words.
|
||||
Consider the placement of the last translated word: there is only one
|
||||
empty slot left in the target sentence, so the distortion probability
|
||||
should be 1.0 for that position and 0.0 everywhere else. However, the
|
||||
distortion probabilities for Models 3 and 4 are set up such that all
|
||||
positions are under consideration.
|
||||
|
||||
IBM Model 5 fixes this deficiency by accounting for occupied slots
|
||||
during translation. It introduces the vacancy function v(j), the number
|
||||
of vacancies up to, and including, position j in the target sentence.
|
||||
|
||||
Terminology
|
||||
-----------
|
||||
|
||||
:Maximum vacancy:
|
||||
The number of valid slots that a word can be placed in.
|
||||
This is not necessarily the same as the number of vacant slots.
|
||||
For example, if a tablet contains more than one word, the head word
|
||||
cannot be placed at the last vacant slot because there will be no
|
||||
space for the other words in the tablet. The number of valid slots
|
||||
has to take into account the length of the tablet.
|
||||
Non-head words cannot be placed before the head word, so vacancies
|
||||
to the left of the head word are ignored.
|
||||
:Vacancy difference:
|
||||
For a head word: (v(j) - v(center of previous cept))
|
||||
Can be positive or negative.
|
||||
For a non-head word: (v(j) - v(position of previously placed word))
|
||||
Always positive, because successive words in a tablet are assumed to
|
||||
appear to the right of the previous word.
|
||||
|
||||
Positioning of target words falls under three cases:
|
||||
|
||||
1. Words generated by NULL are distributed uniformly
|
||||
2. For a head word t, its position is modeled by the probability
|
||||
v_head(dv | max_v,word_class_t(t))
|
||||
3. For a non-head word t, its position is modeled by the probability
|
||||
v_non_head(dv | max_v,word_class_t(t))
|
||||
|
||||
dv and max_v are defined differently for head and non-head words.
|
||||
|
||||
The EM algorithm used in Model 5 is:
|
||||
|
||||
:E step: In the training data, collect counts, weighted by prior
|
||||
probabilities.
|
||||
|
||||
- (a) count how many times a source language word is translated
|
||||
into a target language word
|
||||
- (b) for a particular word class and maximum vacancy, count how
|
||||
many times a head word and the previous cept's center have
|
||||
a particular difference in number of vacancies
|
||||
- (c) for a particular word class and maximum vacancy, count how
|
||||
many times a non-head word and the previous target word
|
||||
have a particular difference in number of vacancies
|
||||
- (d) count how many times a source word is aligned to phi number
|
||||
of target words
|
||||
- (e) count how many times NULL is aligned to a target word
|
||||
|
||||
:M step: Estimate new probabilities based on the counts from the E step
|
||||
|
||||
Like Model 4, there are too many possible alignments to consider. Thus,
|
||||
a hill climbing approach is used to sample good candidates. In addition,
|
||||
pruning is used to weed out unlikely alignments based on Model 4 scores.
|
||||
|
||||
Notations
|
||||
---------
|
||||
|
||||
:i: Position in the source sentence
|
||||
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
|
||||
:j: Position in the target sentence
|
||||
Valid values are 1, 2, ..., length of target sentence
|
||||
:l: Number of words in the source sentence, excluding NULL
|
||||
:m: Number of words in the target sentence
|
||||
:s: A word in the source language
|
||||
:t: A word in the target language
|
||||
:phi: Fertility, the number of target words produced by a source word
|
||||
:p1: Probability that a target word produced by a source word is
|
||||
accompanied by another target word that is aligned to NULL
|
||||
:p0: 1 - p1
|
||||
:max_v: Maximum vacancy
|
||||
:dv: Vacancy difference, Δv
|
||||
|
||||
The definition of v_head here differs from GIZA++, section 4.7 of
|
||||
[Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is
|
||||
v_head(v(j) | v(center of previous cept),max_v,word_class(t)).
|
||||
|
||||
Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with
|
||||
v(center of previous cept) to obtain dv:
|
||||
v_head(v(j) - v(center of previous cept) | max_v,word_class(t)).
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from math import factorial
|
||||
|
||||
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel4
|
||||
from nltk.translate.ibm_model import Counts, longest_target_sentence_length
|
||||
|
||||
|
||||
class IBMModel5(IBMModel):
|
||||
"""
|
||||
Translation model that keeps track of vacant positions in the target
|
||||
sentence to decide where to place translated words
|
||||
|
||||
>>> bitext = []
|
||||
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
|
||||
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
|
||||
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
|
||||
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
|
||||
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
|
||||
>>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
|
||||
>>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
|
||||
>>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
|
||||
>>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
|
||||
|
||||
>>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)
|
||||
|
||||
>>> print(round(ibm5.head_vacancy_table[1][1][1], 3))
|
||||
1.0
|
||||
>>> print(round(ibm5.head_vacancy_table[2][1][1], 3))
|
||||
0.0
|
||||
>>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3))
|
||||
1.0
|
||||
|
||||
>>> print(round(ibm5.fertility_table[2]['summarize'], 3))
|
||||
1.0
|
||||
>>> print(round(ibm5.fertility_table[1]['book'], 3))
|
||||
1.0
|
||||
|
||||
>>> print(round(ibm5.p1, 3))
|
||||
0.033
|
||||
|
||||
>>> test_sentence = bitext[2]
|
||||
>>> test_sentence.words
|
||||
['das', 'buch', 'ist', 'ja', 'klein']
|
||||
>>> test_sentence.mots
|
||||
['the', 'book', 'is', 'small']
|
||||
>>> test_sentence.alignment
|
||||
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
|
||||
|
||||
"""
|
||||
|
||||
MIN_SCORE_FACTOR = 0.2
|
||||
"""
|
||||
Alignments with scores below this factor are pruned during sampling
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    sentence_aligned_corpus,
    iterations,
    source_word_classes,
    target_word_classes,
    probability_tables=None,
):
    """
    Train on ``sentence_aligned_corpus`` and create a lexical
    translation model, vacancy models, a fertility model, and a
    model for generating NULL-aligned words.

    Translation direction is from ``AlignedSent.mots`` to
    ``AlignedSent.words``.

    :param sentence_aligned_corpus: Sentence-aligned parallel corpus
    :type sentence_aligned_corpus: list(AlignedSent)

    :param iterations: Number of iterations to run training algorithm
    :type iterations: int

    :param source_word_classes: Lookup table that maps a source word
        to its word class, the latter represented by an integer id
    :type source_word_classes: dict[str]: int

    :param target_word_classes: Lookup table that maps a target word
        to its word class, the latter represented by an integer id
    :type target_word_classes: dict[str]: int

    :param probability_tables: Optional. Use this to pass in custom
        probability values. If not specified, the Model 4 tables come
        from an ``IBMModel4`` trained with the same arguments and the
        vacancy tables are initialised by
        ``set_uniform_probabilities``.
        If specified, all the following entries must be present:
        ``translation_table``, ``alignment_table``,
        ``fertility_table``, ``p1``, ``head_distortion_table``,
        ``non_head_distortion_table``, ``head_vacancy_table``,
        ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``,
        and ``IBMModel5`` for the type and purpose of these tables.
    :type probability_tables: dict[str]: object
    """
    super().__init__(sentence_aligned_corpus)
    self.reset_probabilities()
    self.src_classes = source_word_classes
    self.trg_classes = target_word_classes

    model4_tables = (
        "translation_table",
        "alignment_table",
        "fertility_table",
        "p1",
        "head_distortion_table",
        "non_head_distortion_table",
    )
    vacancy_tables = ("head_vacancy_table", "non_head_vacancy_table")

    if probability_tables is not None:
        # Set user-defined probabilities
        for name in model4_tables + vacancy_tables:
            setattr(self, name, probability_tables[name])
    else:
        # Get probabilities from IBM model 4
        ibm4 = IBMModel4(
            sentence_aligned_corpus,
            iterations,
            source_word_classes,
            target_word_classes,
        )
        for name in model4_tables:
            setattr(self, name, getattr(ibm4, name))
        self.set_uniform_probabilities(sentence_aligned_corpus)

    for _ in range(iterations):
        self.train(sentence_aligned_corpus)
|
||||
|
||||
def reset_probabilities(self):
    """
    Reset the tables of the base class, then replace both vacancy
    tables with empty ones whose missing entries read as ``MIN_PROB``.
    """
    super().reset_probabilities()

    def new_vacancy_table():
        # Three nested levels; unseen leaves default to self.MIN_PROB
        return defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        )

    self.head_vacancy_table = new_vacancy_table()
    """
    dict[int][int][int]: float. Probability(vacancy difference |
    number of remaining valid positions,target word class).
    Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``.
    """

    self.non_head_vacancy_table = new_vacancy_table()
    """
    dict[int][int][int]: float. Probability(vacancy difference |
    number of remaining valid positions,target word class).
    Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``.
    """
|
||||
|
||||
def set_uniform_probabilities(self, sentence_aligned_corpus):
    """
    Set vacancy probabilities uniformly to
    1 / cardinality of vacancy difference values

    For every maximum vacancy ``max_v`` from 1 to the length of the
    longest target sentence, the entries of both vacancy tables default
    to ``1 / (2 * max_v)`` for every vacancy difference from
    ``-(max_m - 1)`` to ``max_m``.

    :param sentence_aligned_corpus: Corpus passed to
        ``longest_target_sentence_length`` to determine ``max_m``
    """
    max_m = longest_target_sentence_length(sentence_aligned_corpus)

    # The maximum vacancy difference occurs when a word is placed in
    # the last available position m of the target sentence and the
    # previous word position has no vacancies.
    # The minimum is 1-max_v, when a word is placed in the first
    # available position and the previous word is placed beyond the
    # last available position.
    # Thus, the number of possible vacancy difference values is
    # (max_v) - (1-max_v) + 1 = 2 * max_v.
    if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB:
        warnings.warn(
            "A target sentence is too long ("
            + str(max_m)
            + " words). Results may be less accurate."
        )

    tables = (self.head_vacancy_table, self.non_head_vacancy_table)
    for max_v in range(1, max_m + 1):
        # Depends only on max_v, so compute it once per outer iteration
        initial_prob = 1 / (2 * max_v)
        for dv in range(1, max_m + 1):
            for table in tables:
                # Bind the current value as a default argument. A plain
                # ``lambda: initial_prob`` would look the variable up when
                # the default is first needed, i.e. after the loops have
                # finished, so every max_v would get 1 / (2 * max_m).
                table[dv][max_v] = defaultdict(lambda p=initial_prob: p)
                table[-(dv - 1)][max_v] = defaultdict(lambda p=initial_prob: p)
|
||||
|
||||
def train(self, parallel_corpus):
    """
    Run one EM iteration over ``parallel_corpus``.

    For every sentence pair, a set of alignments is sampled, the best
    one is stored on the sentence pair, and each sampled alignment
    contributes counts weighted by its share of the total probability
    of the sample. Vacancy counts are collected source word by source
    word against a fresh ``Slots`` object per alignment. The tables are
    then re-estimated; the alignment table is carried over unchanged.

    :param parallel_corpus: Sentence pairs to train on; each item's
        ``alignment`` attribute is overwritten
    """
    counts = Model5Counts()

    for sentence_pair in parallel_corpus:
        source_positions = range(1, len(sentence_pair.mots) + 1)
        m = len(sentence_pair.words)
        target_positions = range(1, m + 1)

        # Sample the alignment space and keep the most probable alignment
        sample, best = self.sample(sentence_pair)
        sentence_pair.alignment = Alignment(best.zero_indexed_alignment())

        # E step (a): normalization factor used to weigh the counts
        total = self.prob_of_alignments(sample)

        # E step (b): collect counts
        for info in sample:
            weight = self.prob_t_a_given_s(info) / total

            for j in target_positions:
                counts.update_lexical_translation(weight, info, j)

            # All slots start vacant; update_vacancy fills them in
            slots = Slots(m)
            for i in source_positions:
                counts.update_vacancy(weight, info, i, self.trg_classes, slots)

            counts.update_null_generation(weight, info)
            counts.update_fertility(weight, info)

    # M step: update probabilities with maximum likelihood estimates.
    # Any probability below MIN_PROB is clamped to MIN_PROB.
    kept_alignment_table = self.alignment_table
    self.reset_probabilities()
    self.alignment_table = kept_alignment_table  # don't retrain

    self.maximize_lexical_translation_probabilities(counts)
    self.maximize_vacancy_probabilities(counts)
    self.maximize_fertility_probabilities(counts)
    self.maximize_null_generation_probabilities(counts)
|
||||
|
||||
def sample(self, sentence_pair):
    """
    Sample the most probable alignments from the entire alignment
    space according to Model 4

    Note that Model 4 scoring is used instead of Model 5 because the
    latter is too expensive to compute.

    First, determine the best alignment according to IBM Model 2.
    With this initial alignment, use hill climbing to determine the
    best alignment according to IBM Model 4. Add this alignment and
    its neighbors to the sample set. Repeat this process with other
    initial alignments obtained by pegging an alignment point.
    Finally, prune alignments that have substantially lower Model 4
    scores than the best alignment.

    :param sentence_pair: Source and target language sentence pair
        to generate a sample of alignments from
    :type sentence_pair: AlignedSent

    :return: A set of best alignments represented by their ``AlignmentInfo``
        and the best alignment of the set for convenience
    :rtype: set(AlignmentInfo), AlignmentInfo
    """
    candidates, best_alignment = super().sample(sentence_pair)
    survivors = self.prune(candidates)
    return survivors, best_alignment
|
||||
|
||||
def prune(self, alignment_infos):
    """
    Removes alignments from ``alignment_infos`` that have
    substantially lower Model 4 scores than the best alignment

    An alignment is kept only if its score is strictly greater than
    ``MIN_SCORE_FACTOR`` times the best score.

    :return: Pruned alignments
    :rtype: set(AlignmentInfo)
    """
    model4_score = IBMModel4.model4_prob_t_a_given_s
    scored = [(info, model4_score(info, self)) for info in alignment_infos]

    # The leading 0 keeps the result defined for an empty input
    best_score = max([0] + [score for _, score in scored])
    threshold = IBMModel5.MIN_SCORE_FACTOR * best_score

    return {info for info, score in scored if score > threshold}
|
||||
|
||||
def hillclimb(self, alignment_info, j_pegged=None):
    """
    Starting from the alignment in ``alignment_info``, look at
    neighboring alignments iteratively for the best one, according
    to Model 4

    Note that Model 4 scoring is used instead of Model 5 because the
    latter is too expensive to compute.

    There is no guarantee that the best alignment in the alignment
    space will be found, because the algorithm might be stuck in a
    local maximum.

    :param j_pegged: If specified, the search will be constrained to
        alignments where ``j_pegged`` remains unchanged
    :type j_pegged: int

    :return: The best alignment found from hill climbing, with its
        ``score`` attribute set to its Model 4 probability
    :rtype: AlignmentInfo
    """
    model4_score = IBMModel4.model4_prob_t_a_given_s
    current = alignment_info
    best_probability = model4_score(current, self)

    improved = True
    while improved:
        start_of_round = current
        # Neighbors are generated once per round, from the alignment the
        # round started with
        for candidate in self.neighboring(current, j_pegged):
            candidate_probability = model4_score(candidate, self)
            if candidate_probability > best_probability:
                current = candidate
                best_probability = candidate_probability

        # Stop once a full round yields no better alignment
        improved = not (current == start_of_round)

    current.score = best_probability
    return current
|
||||
|
||||
def prob_t_a_given_s(self, alignment_info):
    """
    Probability of target sentence and an alignment given the
    source sentence

    The result is the product of a NULL-generation term, a fertility
    term, one lexical translation term per target position and one
    vacancy term per source position. Whenever the running product
    drops below ``IBMModel.MIN_PROB``, the computation stops and
    ``MIN_PROB`` is returned.

    :param alignment_info: Alignment to score. Its ``trg_sentence``,
        ``src_sentence``, ``alignment`` and ``cepts`` attributes are
        read, and ``fertility_of_i``, ``previous_cept`` and
        ``center_of_cept`` are called on it.
    """
    probability = 1.0
    MIN_PROB = IBMModel.MIN_PROB
    # Shared by every vacancy_term call below: positions get marked as
    # occupied as the source words are processed in order.
    slots = Slots(len(alignment_info.trg_sentence) - 1)

    def null_generation_term():
        # Binomial distribution: B(m - null_fertility, p1)
        value = 1.0
        p1 = self.p1
        p0 = 1 - p1
        null_fertility = alignment_info.fertility_of_i(0)
        m = len(alignment_info.trg_sentence) - 1
        value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
        if value < MIN_PROB:
            return MIN_PROB

        # Combination: (m - null_fertility) choose null_fertility
        for i in range(1, null_fertility + 1):
            value *= (m - null_fertility - i + 1) / i
        return value

    def fertility_term():
        value = 1.0
        src_sentence = alignment_info.src_sentence
        # Index 0 is skipped here; its fertility is handled by
        # null_generation_term
        for i in range(1, len(src_sentence)):
            fertility = alignment_info.fertility_of_i(i)
            value *= (
                factorial(fertility)
                * self.fertility_table[fertility][src_sentence[i]]
            )
            if value < MIN_PROB:
                return MIN_PROB
        return value

    def lexical_translation_term(j):
        t = alignment_info.trg_sentence[j]
        i = alignment_info.alignment[j]
        s = alignment_info.src_sentence[i]
        return self.translation_table[t][s]

    def vacancy_term(i):
        # Probability of the positions of all target words in the tablet
        # of source position i. Mutates ``slots`` as a side effect.
        value = 1.0
        tablet = alignment_info.cepts[i]
        tablet_length = len(tablet)
        # Vacancies over the whole sentence, before this tablet is placed
        total_vacancies = slots.vacancies_at(len(slots))

        # case 1: NULL-aligned words
        if tablet_length == 0:
            return value

        # case 2: head word
        j = tablet[0]
        previous_cept = alignment_info.previous_cept(j)
        previous_center = alignment_info.center_of_cept(previous_cept)
        dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
        # Leave room for the remaining words of the tablet
        max_v = total_vacancies - tablet_length + 1
        trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
        value *= self.head_vacancy_table[dv][max_v][trg_class]
        slots.occupy(j)  # mark position as occupied
        total_vacancies -= 1
        if value < MIN_PROB:
            return MIN_PROB

        # case 3: non-head words
        for k in range(1, tablet_length):
            previous_position = tablet[k - 1]
            previous_vacancies = slots.vacancies_at(previous_position)
            j = tablet[k]
            dv = slots.vacancies_at(j) - previous_vacancies
            # Vacancies up to the previous word of the tablet are excluded
            max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
            trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
            value *= self.non_head_vacancy_table[dv][max_v][trg_class]
            slots.occupy(j)  # mark position as occupied
            total_vacancies -= 1
            if value < MIN_PROB:
                return MIN_PROB

        return value

    # end nested functions

    # Abort computation whenever probability falls below MIN_PROB at
    # any point, since MIN_PROB can be considered as zero
    probability *= null_generation_term()
    if probability < MIN_PROB:
        return MIN_PROB

    probability *= fertility_term()
    if probability < MIN_PROB:
        return MIN_PROB

    for j in range(1, len(alignment_info.trg_sentence)):
        probability *= lexical_translation_term(j)
        if probability < MIN_PROB:
            return MIN_PROB

    # Source positions must be visited in increasing order because each
    # vacancy_term call depends on the slots occupied by earlier calls
    for i in range(1, len(alignment_info.src_sentence)):
        probability *= vacancy_term(i)
        if probability < MIN_PROB:
            return MIN_PROB

    return probability
|
||||
|
||||
def maximize_vacancy_probabilities(self, counts):
    """
    Re-estimate both vacancy tables from ``counts``.

    Each estimate is the count for a vacancy difference divided by the
    count over all vacancy differences for the same maximum vacancy and
    target word class, and is clamped from below to
    ``IBMModel.MIN_PROB``.

    :param counts: Object exposing ``head_vacancy``,
        ``head_vacancy_for_any_dv``, ``non_head_vacancy`` and
        ``non_head_vacancy_for_any_dv``
    """
    floor = IBMModel.MIN_PROB

    def reestimate(table, joint_counts, totals):
        # Write one clamped estimate per (dv, max_v, target class) seen
        for dv, by_max_v in joint_counts.items():
            for max_v, by_trg_class in by_max_v.items():
                for t_cls, joint in by_trg_class.items():
                    ratio = joint / totals[max_v][t_cls]
                    table[dv][max_v][t_cls] = max(ratio, floor)

    reestimate(
        self.head_vacancy_table,
        counts.head_vacancy,
        counts.head_vacancy_for_any_dv,
    )
    reestimate(
        self.non_head_vacancy_table,
        counts.non_head_vacancy,
        counts.non_head_vacancy_for_any_dv,
    )
|
||||
|
||||
|
||||
class Model5Counts(Counts):
    """
    Counts accumulated during one training iteration, extended with
    vacancy counts for head and non-head words.
    """

    def __init__(self):
        super().__init__()
        # head_vacancy[dv][max_v][trg_class]
        self.head_vacancy = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        # head_vacancy_for_any_dv[max_v][trg_class]
        self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(float))
        # non_head_vacancy[dv][max_v][trg_class]
        self.non_head_vacancy = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        # non_head_vacancy_for_any_dv[max_v][trg_class]
        self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(float))

    def update_vacancy(self, count, alignment_info, i, trg_classes, slots):
        """
        :param count: Value to add to the vacancy counts
        :param alignment_info: Alignment under consideration
        :param i: Source word position under consideration
        :param trg_classes: Target word classes
        :param slots: Vacancy states of the slots in the target sentence.
            Output parameter that will be modified as new words are placed
            in the target sentence.
        """
        trg_sentence = alignment_info.trg_sentence
        tablet = alignment_info.cepts[i]
        words_in_tablet = len(tablet)
        remaining = slots.vacancies_at(len(slots))

        # case 1: NULL aligned words
        if not words_in_tablet:
            return  # ignore zero fertility words

        # case 2: head word
        head_j = tablet[0]
        previous_cept = alignment_info.previous_cept(head_j)
        previous_center = alignment_info.center_of_cept(previous_cept)
        dv = slots.vacancies_at(head_j) - slots.vacancies_at(previous_center)
        max_v = remaining - words_in_tablet + 1
        trg_class = trg_classes[trg_sentence[head_j]]
        self.head_vacancy[dv][max_v][trg_class] += count
        self.head_vacancy_for_any_dv[max_v][trg_class] += count
        slots.occupy(head_j)  # mark position as occupied
        remaining -= 1

        # case 3: non-head words
        for k in range(1, words_in_tablet):
            vacancies_before = slots.vacancies_at(tablet[k - 1])
            j = tablet[k]
            dv = slots.vacancies_at(j) - vacancies_before
            max_v = remaining - words_in_tablet + k + 1 - vacancies_before
            trg_class = trg_classes[trg_sentence[j]]
            self.non_head_vacancy[dv][max_v][trg_class] += count
            self.non_head_vacancy_for_any_dv[max_v][trg_class] += count
            slots.occupy(j)  # mark position as occupied
            remaining -= 1
|
||||
|
||||
|
||||
class Slots:
    """
    Occupancy tracker for the positions of a target sentence.

    Positions are 1-indexed; index 0 of the internal list is an unused
    placeholder.
    """

    def __init__(self, target_sentence_length):
        # One extra leading entry so that position p lives at index p
        self._slots = [False for _ in range(target_sentence_length + 1)]

    def occupy(self, position):
        """
        Mark the slot at ``position`` as occupied
        """
        self._slots[position] = True

    def vacancies_at(self, position):
        """
        :return: Number of vacant slots up to, and including, ``position``
        """
        return sum(1 for k in range(1, position + 1) if not self._slots[k])

    def __len__(self):
        # The dummy zeroth element does not count as a position
        return len(self._slots) - 1
|
||||
549
backend/venv/Lib/site-packages/nltk/translate/ibm_model.py
Normal file
549
backend/venv/Lib/site-packages/nltk/translate/ibm_model.py
Normal file
@@ -0,0 +1,549 @@
|
||||
# Natural Language Toolkit: IBM Model Core
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Common methods and classes for all IBM models. See ``IBMModel1``,
|
||||
``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
|
||||
for specific implementations.
|
||||
|
||||
The IBM models are a series of generative models that learn lexical
|
||||
translation probabilities, p(target language word|source language word),
|
||||
given a sentence-aligned parallel corpus.
|
||||
|
||||
The models increase in sophistication from model 1 to 5. Typically, the
|
||||
output of lower models is used to seed the higher models. All models
|
||||
use the Expectation-Maximization (EM) algorithm to learn various
|
||||
probability tables.
|
||||
|
||||
Words in a sentence are one-indexed. The first word of a sentence has
|
||||
position 1, not 0. Index 0 is reserved in the source sentence for the
|
||||
NULL token. The concept of position does not apply to NULL, but it is
|
||||
indexed at 0 by convention.
|
||||
|
||||
Each target word is aligned to exactly one source word or the NULL
|
||||
token.
|
||||
|
||||
References:
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
from bisect import insort_left
|
||||
from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from math import ceil
|
||||
|
||||
|
||||
def longest_target_sentence_length(sentence_aligned_corpus):
    """
    :param sentence_aligned_corpus: Parallel corpus under consideration
    :type sentence_aligned_corpus: list(AlignedSent)
    :return: Number of words in the longest target language sentence
        of ``sentence_aligned_corpus``, or 0 for an empty corpus
    """
    return max(
        (len(aligned_sentence.words) for aligned_sentence in sentence_aligned_corpus),
        default=0,
    )
|
||||
|
||||
|
||||
class IBMModel:
    """
    Abstract base class for all IBM models
    """

    # Probabilities are floored at MIN_PROB to avoid division by zero and
    # precision errors. This is theoretically incorrect, since the floored
    # probabilities may sum to more than 1. In practice the contribution
    # of entries at MIN_PROB is tiny enough to be treated as zero.
    MIN_PROB = 1.0e-12  # GIZA++ is more liberal and uses 1.0e-7

    def __init__(self, sentence_aligned_corpus):
        self.init_vocab(sentence_aligned_corpus)
        self.reset_probabilities()

    def reset_probabilities(self):
        """
        Re-create the probability tables as nested defaultdicts whose
        missing entries read as ``MIN_PROB``, and reset ``p1`` to 0.5.
        """

        def floor():
            return IBMModel.MIN_PROB

        self.translation_table = defaultdict(lambda: defaultdict(floor))
        """
        dict[str][str]: float. Probability(target word | source word).
        Values accessed as ``translation_table[target_word][source_word]``.
        """

        self.alignment_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(floor)))
        )
        """
        dict[int][int][int][int]: float. Probability(i | j,l,m).
        Values accessed as ``alignment_table[i][j][l][m]``.
        Used in model 2 and hill climbing in models 3 and above
        """

        self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        """
        dict[int][str]: float. Probability(fertility | source word).
        Values accessed as ``fertility_table[fertility][source_word]``.
        Used in model 3 and higher.
        """

        self.p1 = 0.5
        """
        Probability that a generated word requires another target word
        that is aligned to NULL.
        Used in model 3 and higher.
        """

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Initialize probability tables to a uniform distribution

        No-op here; derived classes should implement this accordingly.
        """
        pass

    def init_vocab(self, sentence_aligned_corpus):
        """
        Collect the source and target vocabularies of the corpus. The
        NULL token (None) is always part of the source vocabulary.
        """
        source_words = set()
        target_words = set()
        for pair in sentence_aligned_corpus:
            target_words.update(pair.words)
            source_words.update(pair.mots)
        # Add the NULL token
        source_words.add(None)

        self.src_vocab = source_words
        """
        set(str): All source language words used in training
        """

        self.trg_vocab = target_words
        """
        set(str): All target language words used in training
        """

    def sample(self, sentence_pair):
        """
        Sample the most probable alignments from the entire alignment
        space

        The search starts from the best alignment according to IBM
        Model 2 and hill climbs to the best alignment according to a
        higher IBM Model. That alignment and its neighbors are added to
        the sample set. The process is then repeated from other initial
        alignments, obtained by pegging each possible alignment point.

        Hill climbing may get stuck in a local maximum, hence the
        pegging and trying out of different alignments.

        :param sentence_pair: Source and target language sentence pair
            to generate a sample of alignments from
        :type sentence_pair: AlignedSent

        :return: A set of best alignments represented by their ``AlignmentInfo``
            and the best alignment of the set for convenience
        :rtype: set(AlignmentInfo), AlignmentInfo
        """
        src_length = len(sentence_pair.mots)
        trg_length = len(sentence_pair.words)

        # Start from the best model 2 alignment
        best = self.hillclimb(self.best_model2_alignment(sentence_pair))
        samples = set(self.neighboring(best))

        # Start from other model 2 alignments,
        # with the constraint that j is aligned (pegged) to i
        for j in range(1, trg_length + 1):
            for i in range(src_length + 1):
                seed = self.best_model2_alignment(sentence_pair, j, i)
                candidate = self.hillclimb(seed, j)
                samples.update(self.neighboring(candidate, j))
                if candidate.score > best.score:
                    best = candidate

        return samples, best

    def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
        """
        Finds the best alignment according to IBM Model 2

        Used as a starting point for hill climbing in Models 3 and
        above, because it is easier to compute than the best alignments
        in higher models

        :param sentence_pair: Source and target language sentence pair
            to be word-aligned
        :type sentence_pair: AlignedSent

        :param j_pegged: If specified, the alignment point of j_pegged
            will be fixed to i_pegged
        :type j_pegged: int

        :param i_pegged: Alignment point to j_pegged
        :type i_pegged: int
        """
        src_sentence = [None] + sentence_pair.mots
        trg_sentence = ["UNUSED"] + sentence_pair.words  # 1-indexed

        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1

        alignment = [0] * (m + 1)  # init all alignments to NULL
        cepts = [[] for _ in range(l + 1)]  # init all cepts to empty list

        for j in range(1, m + 1):
            if j == j_pegged:
                # use the pegged alignment instead of searching for best one
                chosen_i = i_pegged
            else:
                chosen_i = 0
                highest_prob = IBMModel.MIN_PROB
                t = trg_sentence[j]

                for i, s in enumerate(src_sentence):
                    prob = self.translation_table[t][s] * self.alignment_table[i][j][l][m]
                    # ">=" lets a later source position win a tie
                    if prob >= highest_prob:
                        highest_prob = prob
                        chosen_i = i

            alignment[j] = chosen_i
            cepts[chosen_i].append(j)

        return AlignmentInfo(
            tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts
        )

    def hillclimb(self, alignment_info, j_pegged=None):
        """
        Starting from the alignment in ``alignment_info``, look at
        neighboring alignments iteratively for the best one

        There is no guarantee that the best alignment in the alignment
        space will be found, because the algorithm might be stuck in a
        local maximum.

        :param j_pegged: If specified, the search will be constrained to
            alignments where ``j_pegged`` remains unchanged
        :type j_pegged: int

        :return: The best alignment found from hill climbing
        :rtype: AlignmentInfo
        """
        current = alignment_info
        best_probability = self.prob_t_a_given_s(current)

        while True:
            starting_point = current
            for neighbor in self.neighboring(current, j_pegged):
                probability = self.prob_t_a_given_s(neighbor)
                if probability > best_probability:
                    current = neighbor
                    best_probability = probability

            if current == starting_point:
                # Until there are no better alignments
                break

        current.score = best_probability
        return current

    def neighboring(self, alignment_info, j_pegged=None):
        """
        Determine the neighbors of ``alignment_info``, obtained by
        moving or swapping one alignment point

        :param j_pegged: If specified, neighbors that have a different
            alignment point from j_pegged will not be considered
        :type j_pegged: int

        :return: A set neighboring alignments represented by their
            ``AlignmentInfo``
        :rtype: set(AlignmentInfo)
        """
        neighbors = set()

        src_sentence = alignment_info.src_sentence
        trg_sentence = alignment_info.trg_sentence
        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1
        base_alignment = alignment_info.alignment
        base_cepts = alignment_info.cepts

        # Alignments that differ by one alignment point
        for j in range(1, m + 1):
            if j == j_pegged:
                continue
            current_i = base_alignment[j]
            for i in range(l + 1):
                moved_alignment = list(base_alignment)
                moved_cepts = deepcopy(base_cepts)

                # update alignment
                moved_alignment[j] = i

                # update cepts
                insort_left(moved_cepts[i], j)
                moved_cepts[current_i].remove(j)

                neighbors.add(
                    AlignmentInfo(
                        tuple(moved_alignment), src_sentence, trg_sentence, moved_cepts
                    )
                )

        # Alignments that have two alignment points swapped
        for j in range(1, m + 1):
            if j == j_pegged:
                continue
            i = base_alignment[j]
            for other_j in range(1, m + 1):
                if other_j == j_pegged or other_j == j:
                    continue
                swapped_alignment = list(base_alignment)
                swapped_cepts = deepcopy(base_cepts)
                other_i = base_alignment[other_j]

                # update alignments
                swapped_alignment[j] = other_i
                swapped_alignment[other_j] = i

                # update cepts
                swapped_cepts[other_i].remove(other_j)
                insort_left(swapped_cepts[other_i], j)
                swapped_cepts[i].remove(j)
                insort_left(swapped_cepts[i], other_j)

                neighbors.add(
                    AlignmentInfo(
                        tuple(swapped_alignment),
                        src_sentence,
                        trg_sentence,
                        swapped_cepts,
                    )
                )

        return neighbors

    def maximize_lexical_translation_probabilities(self, counts):
        """
        Set ``translation_table`` from the normalized counts in
        ``counts``, floored at ``MIN_PROB``.
        """
        for t, src_counts in counts.t_given_s.items():
            for s, joint_count in src_counts.items():
                estimate = joint_count / counts.any_t_given_s[s]
                self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)

    def maximize_fertility_probabilities(self, counts):
        """
        Set ``fertility_table`` from the normalized counts in
        ``counts``, floored at ``MIN_PROB``.
        """
        for phi, src_counts in counts.fertility.items():
            for s, phi_count in src_counts.items():
                estimate = phi_count / counts.fertility_for_any_phi[s]
                self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)

    def maximize_null_generation_probabilities(self, counts):
        """
        Set ``p1`` from ``counts``, clamped to
        [``MIN_PROB``, 1 - ``MIN_PROB``].
        """
        estimate = max(counts.p1 / (counts.p1 + counts.p0), IBMModel.MIN_PROB)
        # Clip p1 if it is too large, because p0 = 1 - p1 should not be
        # smaller than MIN_PROB
        self.p1 = min(estimate, 1 - IBMModel.MIN_PROB)

    def prob_of_alignments(self, alignments):
        """
        :return: Sum of ``prob_t_a_given_s`` over ``alignments``
        """
        return sum(self.prob_t_a_given_s(info) for info in alignments)

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence

        All required information is assumed to be in ``alignment_info``
        and self.

        This base implementation always returns 0.0; derived classes
        should override this method
        """
        return 0.0
|
||||
|
||||
|
||||
class AlignmentInfo:
    """
    Helper data object for training IBM Models 3 and up

    Read-only. For a source sentence and its counterpart in the target
    language, this class holds information about the sentence pair's
    alignment, cepts, and fertility.

    Two objects compare equal, and hash alike, when their ``alignment``
    tuples are equal.

    Warning: Alignments are one-indexed here, in contrast to
    nltk.translate.Alignment and AlignedSent, which are zero-indexed
    This class is not meant to be used outside of IBM models.
    """

    def __init__(self, alignment, src_sentence, trg_sentence, cepts):
        if not isinstance(alignment, tuple):
            raise TypeError(
                "The alignment must be a tuple because it is used "
                "to uniquely identify AlignmentInfo objects."
            )

        self.alignment = alignment
        """
        tuple(int): Alignment function. ``alignment[j]`` is the position
        in the source sentence that is aligned to the position j in the
        target sentence.
        """

        self.src_sentence = src_sentence
        """
        tuple(str): Source sentence referred to by this object.
        Should include NULL token (None) in index 0.
        """

        self.trg_sentence = trg_sentence
        """
        tuple(str): Target sentence referred to by this object.
        Should have a dummy element in index 0 so that the first word
        starts from index 1.
        """

        self.cepts = cepts
        """
        list(list(int)): The positions of the target words, in
        ascending order, aligned to a source word position. For example,
        cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7
        of the target sentence are aligned to the word in position 4 of
        the source sentence
        """

        self.score = None
        """
        float: Optional. Probability of alignment, as defined by the
        IBM model that assesses this alignment
        """

    def fertility_of_i(self, i):
        """
        Fertility of word in position ``i`` of the source sentence
        """
        return len(self.cepts[i])

    def is_head_word(self, j):
        """
        :return: Whether the word in position ``j`` of the target
            sentence is a head word
        """
        i = self.alignment[j]
        return self.cepts[i][0] == j

    def center_of_cept(self, i):
        """
        :return: The ceiling of the average positions of the words in
            the tablet of cept ``i``, or 0 if ``i`` is None
        :raises ZeroDivisionError: If the tablet of cept ``i`` is empty
        """
        if i is None:
            return 0

        average_position = sum(self.cepts[i]) / len(self.cepts[i])
        return int(ceil(average_position))

    def previous_cept(self, j):
        """
        :return: The previous cept of ``j``, or None if ``j`` belongs to
            the first cept
        :raises ValueError: If ``j`` is aligned to NULL
        """
        i = self.alignment[j]
        if i == 0:
            raise ValueError(
                "Words aligned to NULL cannot have a previous "
                "cept because NULL has no position"
            )
        previous_cept = i - 1
        # Skip over source words that generated no target words
        while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
            previous_cept -= 1

        if previous_cept <= 0:
            previous_cept = None
        return previous_cept

    def previous_in_tablet(self, j):
        """
        :return: The position of the previous word that is in the same
            tablet as ``j``, or None if ``j`` is the first word of the
            tablet
        """
        i = self.alignment[j]
        tablet_position = self.cepts[i].index(j)
        if tablet_position == 0:
            return None
        return self.cepts[i][tablet_position - 1]

    def zero_indexed_alignment(self):
        """
        :return: Zero-indexed alignment, suitable for use in external
            ``nltk.translate`` modules like ``nltk.translate.Alignment``
        :rtype: list(tuple)
        """
        zero_indexed_alignment = []
        for j in range(1, len(self.trg_sentence)):
            i = self.alignment[j] - 1
            if i < 0:
                i = None  # alignment to NULL token
            zero_indexed_alignment.append((j - 1, i))
        return zero_indexed_alignment

    def __eq__(self, other):
        # Comparing against a foreign type (e.g. None) must not raise
        # AttributeError; let Python fall back to its default handling.
        if not isinstance(other, AlignmentInfo):
            return NotImplemented
        return self.alignment == other.alignment

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.alignment)
|
||||
|
||||
|
||||
class Counts:
    """
    Accumulator for the expected counts gathered during training
    """

    def __init__(self):
        # Lexical translation: counts per [target word][source word],
        # and their totals per source word
        self.t_given_s = defaultdict(lambda: defaultdict(float))
        self.any_t_given_s = defaultdict(float)
        # NULL generation
        self.p0 = 0.0
        self.p1 = 0.0
        # Fertility: counts per [fertility][source word], and their
        # totals per source word
        self.fertility = defaultdict(lambda: defaultdict(float))
        self.fertility_for_any_phi = defaultdict(float)

    def update_lexical_translation(self, count, alignment_info, j):
        """
        Add ``count`` for the target word at position ``j`` and the
        source word it is aligned to.
        """
        src_word = alignment_info.src_sentence[alignment_info.alignment[j]]
        trg_word = alignment_info.trg_sentence[j]
        self.t_given_s[trg_word][src_word] += count
        self.any_t_given_s[src_word] += count

    def update_null_generation(self, count, alignment_info):
        """
        Update ``p1`` and ``p0`` from the fertility of the NULL token
        and the target sentence length.
        """
        trg_length = len(alignment_info.trg_sentence) - 1  # skip dummy index 0
        null_fertility = alignment_info.fertility_of_i(0)
        self.p1 += null_fertility * count
        self.p0 += (trg_length - 2 * null_fertility) * count

    def update_fertility(self, count, alignment_info):
        """
        Add ``count`` for the fertility of every source position,
        including the NULL token at index 0.
        """
        for i, src_word in enumerate(alignment_info.src_sentence):
            phi = alignment_info.fertility_of_i(i)
            self.fertility[phi][src_word] += count
            self.fertility_for_any_phi[src_word] += count
|
||||
332
backend/venv/Lib/site-packages/nltk/translate/lepor.py
Normal file
332
backend/venv/Lib/site-packages/nltk/translate/lepor.py
Normal file
@@ -0,0 +1,332 @@
|
||||
# Natural Language Toolkit: LEPOR Score
|
||||
#
|
||||
# Copyright (C) 2001-2023 NLTK Project
|
||||
# Author: Ikram Ul Haq (ulhaqi12)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""LEPOR score implementation."""
|
||||
|
||||
import math
|
||||
import re
|
||||
import sys
|
||||
from typing import Callable, List
|
||||
|
||||
import nltk
|
||||
|
||||
|
||||
def length_penalty(reference: List[str], hypothesis: List[str]) -> float:
    """
    Compute the length penalty (LP) component of the LEPOR metric, which
    penalises a hypothesis that is either longer or shorter than the
    reference translation.
    Refer from Eq (2) on https://aclanthology.org/C12-2044

    :param reference: Tokens of the reference sentence
    :type reference: List[str]
    :param hypothesis: Tokens of the hypothesis sentence
    :type hypothesis: List[str]

    :return: 1 when both sentences have the same length, otherwise
        exp(1 - shorter length / longer length).
    :rtype: float
    """
    ref_len, hyp_len = len(reference), len(hypothesis)

    if ref_len == hyp_len:
        return 1

    # The two original branches differ only in which length is the
    # numerator: it is always the shorter one.
    shorter, longer = min(ref_len, hyp_len), max(ref_len, hyp_len)
    return math.exp(1 - (shorter / longer))
|
||||
|
||||
|
||||
def alignment(ref_tokens: List[str], hyp_tokens: List[str]):
    """
    Compute the context-dependent word alignment between a hypothesis
    and a reference, as used by the n-gram positional difference penalty
    component of the LEPOR score.

    Each hypothesis token is handled as follows:

    - no occurrence in the reference: the token is left unaligned;
    - exactly one occurrence: it is aligned to that occurrence;
    - several occurrences: occurrences whose left or right neighbour
      also matches the corresponding neighbour of the hypothesis token
      are preferred. Among the preferred occurrences (or among all
      occurrences when none has matching context) the one nearest to
      the hypothesis position is chosen; ties go to the leftmost.

    :param ref_tokens: A list of tokens in reference sentence.
    :type ref_tokens: List[str]
    :param hyp_tokens: A list of tokens in hypothesis sentence.
    :type hyp_tokens: List[str]

    :return: One-indexed reference positions of the aligned hypothesis
        tokens, in hypothesis order. Unaligned tokens are omitted.
    :rtype: List[int]
    """
    alignments = []

    hyp_len = len(hyp_tokens)
    ref_len = len(ref_tokens)

    for hyp_index, hyp_token in enumerate(hyp_tokens):
        # Reference positions holding the same token.
        ref_indexes = [
            i for i, ref_token in enumerate(ref_tokens) if ref_token == hyp_token
        ]

        # If no match, the token stays unaligned.
        if not ref_indexes:
            continue

        # If only one match, there is nothing to disambiguate.
        if len(ref_indexes) == 1:
            alignments.append(ref_indexes[0])
            continue

        # Otherwise, keep the candidates whose one-token window to the
        # left or to the right also matches.
        with_context = []
        for ref_index in ref_indexes:
            # Index 0 is a valid left neighbour, hence ">= 1" here.
            left_matches = (
                ref_index >= 1
                and hyp_index >= 1
                and ref_tokens[ref_index - 1] == hyp_tokens[hyp_index - 1]
            )
            right_matches = (
                ref_index + 1 < ref_len
                and hyp_index + 1 < hyp_len
                and ref_tokens[ref_index + 1] == hyp_tokens[hyp_index + 1]
            )
            if left_matches or right_matches:
                with_context.append(ref_index)

        # Without any matching context, fall back to every occurrence.
        candidates = with_context or ref_indexes

        # Exactly one append per hypothesis token: pick the nearest
        # candidate (min() returns the first, i.e. leftmost, on ties).
        alignments.append(
            min(candidates, key=lambda ref_index: abs(hyp_index - ref_index))
        )

    # The alignments are one indexed to keep track of the ending slice
    # pointer of the matching ngrams.
    return [a + 1 for a in alignments]
|
||||
|
||||
|
||||
def ngram_positional_penalty(
    ref_tokens: List[str], hyp_tokens: List[str]
) -> (float, float):
    """
    Compute the n-gram position difference penalty (NPosPenal) described
    in the LEPOR paper: the exponential of the negated, length-normalized
    sum of position differences between aligned words.

    :param ref_tokens: A list of words in reference sentence.
    :type ref_tokens: List[str]
    :param hyp_tokens: A list of words in hypothesis sentence.
    :type hyp_tokens: List[str]

    :return: A tuple containing two elements:
        - NPosPenal: N-gram positional penalty.
        - match_count: Number of alignments returned by ``alignment``.
    :rtype: tuple
    """
    aligned_positions = alignment(ref_tokens, hyp_tokens)
    hyp_len = len(hyp_tokens)
    ref_len = len(ref_tokens)

    # |PD| of eq (4) in https://aclanthology.org/C12-2044: for the k-th
    # alignment (counting from 1), the gap between k normalized by the
    # hypothesis length and the aligned reference position normalized
    # by the reference length.
    position_differences = [
        abs(k / hyp_len - ref_position / ref_len)
        for k, ref_position in enumerate(aligned_positions, start=1)
    ]

    normalized_difference = sum(position_differences) / hyp_len
    return math.exp(-normalized_difference), len(aligned_positions)
|
||||
|
||||
|
||||
def harmonic(
    match_count: int,
    reference_length: int,
    hypothesis_length: int,
    alpha: float,
    beta: float,
) -> float:
    """
    Compute the weighted harmonic mean of the recall and the precision
    of the matched words.

    :param match_count: Number of words in hypothesis aligned with reference.
    :type match_count: int
    :param reference_length: Length of the reference sentence
    :type reference_length: int
    :param hypothesis_length: Length of the hypothesis sentence
    :type hypothesis_length: int
    :param alpha: A parameter to set weight for recall.
    :type alpha: float
    :param beta: A parameter to set weight for precision.
    :type beta: float

    :return: Harmonic mean.
    :rtype: float
    """
    # Machine epsilon keeps the divisions below defined when recall or
    # precision is zero.
    tiny = sys.float_info.epsilon

    recall = match_count / reference_length
    precision = match_count / hypothesis_length

    weighted_recall_term = alpha / (recall + tiny)
    weighted_precision_term = beta / (precision + tiny)

    return (alpha + beta) / (weighted_recall_term + weighted_precision_term)
|
||||
|
||||
|
||||
def sentence_lepor(
    references: List[str],
    hypothesis: str,
    alpha: float = 1.0,
    beta: float = 1.0,
    tokenizer: Callable[[str], List[str]] = None,
) -> List[float]:
    """
    Calculate LEPOR score a sentence from Han, A. L.-F. (2017).
    LEPOR: An Augmented Machine Translation Evaluation Metric. https://arxiv.org/abs/1703.08748v2

    >>> hypothesis = 'a bird is on a stone.'

    >>> reference1 = 'a bird behind the stone.'
    >>> reference2 = 'a bird is on the rock.'

    >>> sentence_lepor([reference1, reference2], hypothesis)
    [0.7824248013113159, 0.7739937377760259]

    The ``references`` argument is not modified.

    :param references: Reference sentences
    :type references: list(str)
    :param hypothesis: Hypothesis sentence
    :type hypothesis: str
    :param alpha: A parameter to set weight for recall.
    :type alpha: float
    :param beta: A parameter to set weight for precision.
    :type beta: float
    :param tokenizer: A callable tokenizer that will accept a string and returns a list of tokens.
        If not provided, ``nltk.word_tokenize`` is used.
    :type tokenizer: Callable[[str], List[str]]

    :return: The list of Lepor scores for a hypothesis with all references.
    :rtype: list(float)
    :raises ValueError: If the hypothesis or one of the references has
        no tokens.
    """
    # If tokenizer is not provided, use the one in NLTK.
    tokenize = tokenizer if tokenizer else nltk.word_tokenize

    # Tokenize into local variables: writing the token lists back into
    # ``references`` would corrupt the caller's list and break a second
    # call with the same argument.
    hypothesis_tokens = tokenize(hypothesis)
    tokenized_references = [tokenize(reference) for reference in references]

    lepor_scores = []
    for reference_tokens in tokenized_references:
        if len(reference_tokens) == 0 or len(hypothesis_tokens) == 0:
            raise ValueError("One of the sentence is empty. Exit.")

        # Calculate the length penalty due to the difference in the length of reference and hypothesis.
        lp = length_penalty(reference_tokens, hypothesis_tokens)

        # Calculate the penalty on different positions of same word in translation.
        npd, match_count = ngram_positional_penalty(
            reference_tokens, hypothesis_tokens
        )

        harmonic_score = harmonic(
            match_count, len(reference_tokens), len(hypothesis_tokens), alpha, beta
        )

        lepor_scores.append(lp * npd * harmonic_score)

    return lepor_scores
|
||||
|
||||
|
||||
def corpus_lepor(
    references: List[List[str]],
    hypothesis: List[str],
    alpha: float = 1.0,
    beta: float = 1.0,
    tokenizer: Callable[[str], List[str]] = None,
) -> List[List[float]]:
    """
    Calculate LEPOR score for list of sentences from Han, A. L.-F. (2017).
    LEPOR: An Augmented Machine Translation Evaluation Metric. https://arxiv.org/abs/1703.08748v2

    >>> hypothesis = ['a bird is on a stone.', 'scary crow was not bad.']

    >>> references = [['a bird behind the stone.', 'a bird is on the rock'],
    ...               ['scary cow was good.', 'scary crow was elegant.']]

    >>> corpus_lepor(references, hypothesis)
    [[0.7824248013113159, 0.7931427828105261], [0.5639427891892225, 0.7860963170056643]]

    :param references: Reference sentences, one list of references per hypothesis
    :type references: list(list(str))
    :param hypothesis: Hypothesis sentences
    :type hypothesis: list(str)
    :param alpha: A parameter to set weight for recall.
    :type alpha: float
    :param beta: A parameter to set weight for precision.
    :type beta: float
    :param tokenizer: A callable tokenizer that will accept a string and returns a list of tokens.
    :type tokenizer: Callable[[str], List[str]]

    :return: The Lepor score. Returns a list for all sentences
    :rtype: list(list(float))
    :raises ValueError: if ``references`` or ``hypothesis`` is empty
    :raises AssertionError: if ``references`` and ``hypothesis`` differ in length
    """

    if len(references) == 0 or len(hypothesis) == 0:
        raise ValueError("There is an Empty list. Exit.")

    assert len(references) == len(hypothesis), (
        "The number of hypothesis and their reference(s) should be the " "same "
    )

    # sentence_lepor replaces the entries of the reference list it receives
    # with their tokenized form.  Hand it a shallow copy so the caller's
    # nested lists keep their original strings and the same corpus can be
    # scored more than once.
    return [
        sentence_lepor(list(reference_sen), hypothesis_sen, alpha, beta, tokenizer)
        for reference_sen, hypothesis_sen in zip(references, hypothesis)
    ]
|
||||
409
backend/venv/Lib/site-packages/nltk/translate/meteor_score.py
Normal file
409
backend/venv/Lib/site-packages/nltk/translate/meteor_score.py
Normal file
@@ -0,0 +1,409 @@
|
||||
# Natural Language Toolkit: Machine Translation
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Uday Krishna <udaykrishna5@gmail.com>
|
||||
# Contributor: Tom Aarsen
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
from itertools import chain, product
|
||||
from typing import Callable, Iterable, List, Tuple
|
||||
|
||||
from nltk.corpus import WordNetCorpusReader, wordnet
|
||||
from nltk.stem.api import StemmerI
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
|
||||
|
||||
def _generate_enums(
|
||||
hypothesis: Iterable[str],
|
||||
reference: Iterable[str],
|
||||
preprocess: Callable[[str], str] = str.lower,
|
||||
) -> Tuple[List[Tuple[int, str]], List[Tuple[int, str]]]:
|
||||
"""
|
||||
Takes in pre-tokenized inputs for hypothesis and reference and returns
|
||||
enumerated word lists for each of them
|
||||
|
||||
:param hypothesis: pre-tokenized hypothesis
|
||||
:param reference: pre-tokenized reference
|
||||
:preprocess: preprocessing method (default str.lower)
|
||||
:return: enumerated words list
|
||||
"""
|
||||
if isinstance(hypothesis, str):
|
||||
raise TypeError(
|
||||
f'"hypothesis" expects pre-tokenized hypothesis (Iterable[str]): {hypothesis}'
|
||||
)
|
||||
|
||||
if isinstance(reference, str):
|
||||
raise TypeError(
|
||||
f'"reference" expects pre-tokenized reference (Iterable[str]): {reference}'
|
||||
)
|
||||
|
||||
enum_hypothesis_list = list(enumerate(map(preprocess, hypothesis)))
|
||||
enum_reference_list = list(enumerate(map(preprocess, reference)))
|
||||
return enum_hypothesis_list, enum_reference_list
|
||||
|
||||
|
||||
def exact_match(
    hypothesis: Iterable[str], reference: Iterable[str]
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Pair up identical words of the hypothesis and the reference.

    Both sentences are enumerated with ``_generate_enums`` (default
    preprocessing) and then matched with ``_match_enums``; the mapping is
    expressed through the enumerated word ids.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    numbered_hypothesis, numbered_reference = _generate_enums(hypothesis, reference)
    alignment = _match_enums(numbered_hypothesis, numbered_reference)
    return alignment
|
||||
|
||||
|
||||
def _match_enums(
|
||||
enum_hypothesis_list: List[Tuple[int, str]],
|
||||
enum_reference_list: List[Tuple[int, str]],
|
||||
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
|
||||
"""
|
||||
matches exact words in hypothesis and reference and returns
|
||||
a word mapping between enum_hypothesis_list and enum_reference_list
|
||||
based on the enumerated word id.
|
||||
|
||||
:param enum_hypothesis_list: enumerated hypothesis list
|
||||
:param enum_reference_list: enumerated reference list
|
||||
:return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
|
||||
enumerated unmatched reference tuples
|
||||
"""
|
||||
word_match = []
|
||||
for i in range(len(enum_hypothesis_list))[::-1]:
|
||||
for j in range(len(enum_reference_list))[::-1]:
|
||||
if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
|
||||
word_match.append(
|
||||
(enum_hypothesis_list[i][0], enum_reference_list[j][0])
|
||||
)
|
||||
enum_hypothesis_list.pop(i)
|
||||
enum_reference_list.pop(j)
|
||||
break
|
||||
return word_match, enum_hypothesis_list, enum_reference_list
|
||||
|
||||
|
||||
def _enum_stem_match(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
    stemmer: StemmerI = PorterStemmer(),
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Match hypothesis and reference words whose stems are identical.

    Fresh lists holding the stemmed words are built and handed to
    ``_match_enums``; the input lists themselves are left untouched.  The
    unmatched lists that come back therefore contain the stemmed words.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """

    def _stem_entries(entries):
        # Keep the word id, replace the word by its stem.
        return [(word_id, stemmer.stem(word)) for word_id, word in entries]

    stemmed_hypothesis = _stem_entries(enum_hypothesis_list)
    stemmed_reference = _stem_entries(enum_reference_list)
    return _match_enums(stemmed_hypothesis, stemmed_reference)
|
||||
|
||||
|
||||
def stem_match(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    stemmer: StemmerI = PorterStemmer(),
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Match hypothesis and reference words by their stems.

    The sentences are enumerated with ``_generate_enums`` (default
    preprocessing) and matched with ``_enum_stem_match``.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    numbered_hypothesis, numbered_reference = _generate_enums(hypothesis, reference)
    alignment = _enum_stem_match(
        numbered_hypothesis, numbered_reference, stemmer=stemmer
    )
    return alignment
|
||||
|
||||
|
||||
def _enum_wordnetsyn_match(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Match a reference word to a hypothesis word when the reference word is
    the hypothesis word itself or one of its single-word WordNet synonyms.

    Lemma names containing an underscore are ignored.  Matched entries are
    removed from both input lists IN PLACE, and those same list objects are
    returned holding the unmatched entries.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    word_match = []
    # Scan backwards so popping an entry never shifts a position that is
    # still to be visited.
    for hyp_pos in reversed(range(len(enum_hypothesis_list))):
        hyp_word = enum_hypothesis_list[hyp_pos][1]

        # The word itself plus every underscore-free lemma name from all of
        # its synsets.
        candidates = {hyp_word}
        for synset in wordnet.synsets(hyp_word):
            for lemma in synset.lemmas():
                lemma_name = lemma.name()
                if "_" not in lemma_name:
                    candidates.add(lemma_name)

        for ref_pos in reversed(range(len(enum_reference_list))):
            ref_entry = enum_reference_list[ref_pos]
            if ref_entry[1] not in candidates:
                continue
            word_match.append((enum_hypothesis_list[hyp_pos][0], ref_entry[0]))
            enum_hypothesis_list.pop(hyp_pos)
            enum_reference_list.pop(ref_pos)
            break
    return word_match, enum_hypothesis_list, enum_reference_list
|
||||
|
||||
|
||||
def wordnetsyn_match(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Match reference words to hypothesis words through WordNet synonymy.

    The sentences are enumerated with ``_generate_enums`` (default
    preprocessing) and matched with ``_enum_wordnetsyn_match``.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    numbered_hypothesis, numbered_reference = _generate_enums(hypothesis, reference)
    alignment = _enum_wordnetsyn_match(
        numbered_hypothesis, numbered_reference, wordnet=wordnet
    )
    return alignment
|
||||
|
||||
|
||||
def _enum_align_words(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Align hypothesis words to reference words in three successive passes:
    exact match, stemmed match, then WordNet synonym match.  Each pass only
    sees the words the previous passes left unmatched.  Takes enumerated
    lists as input instead of string input.

    The first pass removes matched entries from the lists passed in.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: list of matched tuples sorted by hypothesis word id,
             unmatched hypothesis list, unmatched reference list
    """
    # Pass 1: identical words.
    exact_pairs, remaining_hyp, remaining_ref = _match_enums(
        enum_hypothesis_list, enum_reference_list
    )
    # Pass 2: identical stems among the leftovers.
    stem_pairs, remaining_hyp, remaining_ref = _enum_stem_match(
        remaining_hyp, remaining_ref, stemmer=stemmer
    )
    # Pass 3: WordNet synonyms among what is still unmatched.
    synonym_pairs, remaining_hyp, remaining_ref = _enum_wordnetsyn_match(
        remaining_hyp, remaining_ref, wordnet=wordnet
    )

    all_pairs = exact_pairs + stem_pairs + synonym_pairs
    all_pairs.sort(key=lambda wordpair: wordpair[0])
    return all_pairs, remaining_hyp, remaining_ref
|
||||
|
||||
|
||||
def align_words(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Align the words of a hypothesis to those of a reference by applying, in
    order, exact match, stemmed match and WordNet based synonym match.

    The sentences are enumerated with ``_generate_enums`` (default
    preprocessing) and aligned with ``_enum_align_words``.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
    """
    numbered_hypothesis, numbered_reference = _generate_enums(hypothesis, reference)
    alignment = _enum_align_words(
        numbered_hypothesis, numbered_reference, stemmer=stemmer, wordnet=wordnet
    )
    return alignment
|
||||
|
||||
|
||||
def _count_chunks(matches: List[Tuple[int, int]]) -> int:
|
||||
"""
|
||||
Counts the fewest possible number of chunks such that matched unigrams
|
||||
of each chunk are adjacent to each other. This is used to calculate the
|
||||
fragmentation part of the metric.
|
||||
|
||||
:param matches: list containing a mapping of matched words (output of align_words)
|
||||
:return: Number of chunks a sentence is divided into post alignment
|
||||
"""
|
||||
i = 0
|
||||
chunks = 1
|
||||
while i < len(matches) - 1:
|
||||
if (matches[i + 1][0] == matches[i][0] + 1) and (
|
||||
matches[i + 1][1] == matches[i][1] + 1
|
||||
):
|
||||
i += 1
|
||||
continue
|
||||
i += 1
|
||||
chunks += 1
|
||||
return chunks
|
||||
|
||||
|
||||
def single_meteor_score(
    reference: Iterable[str],
    hypothesis: Iterable[str],
    preprocess: Callable[[str], str] = str.lower,
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
    alpha: float = 0.9,
    beta: float = 3.0,
    gamma: float = 0.5,
) -> float:
    """
    Calculates METEOR score for single hypothesis and reference as per
    "Meteor: An Automatic Metric for MT Evaluation with High Levels of
    Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
    in Proceedings of ACL.
    https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']


    >>> round(single_meteor_score(reference1, hypothesis1),4)
    0.6944

    If no words match during the alignment the method returns the
    score as 0. We can safely return a zero instead of raising a
    division by zero error as no match usually implies a bad translation.

    >>> round(single_meteor_score(['this', 'is', 'a', 'cat'], ['non', 'matching', 'hypothesis']),4)
    0.0

    :param reference: pre-tokenized reference
    :param hypothesis: pre-tokenized hypothesis
    :param preprocess: preprocessing function (default str.lower)
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :param alpha: parameter for controlling relative weights of precision and recall.
    :param beta: parameter for controlling shape of penalty as a
                 function of fragmentation.
    :param gamma: relative weight assigned to fragmentation penalty.
    :return: The sentence-level METEOR score.
    """
    numbered_hypothesis, numbered_reference = _generate_enums(
        hypothesis, reference, preprocess=preprocess
    )
    # Record the sentence lengths before aligning: the alignment removes
    # matched entries from these lists.
    hypothesis_length = len(numbered_hypothesis)
    reference_length = len(numbered_reference)

    matches = _enum_align_words(
        numbered_hypothesis, numbered_reference, stemmer=stemmer, wordnet=wordnet
    )[0]
    match_total = len(matches)

    try:
        precision = float(match_total) / hypothesis_length
        recall = float(match_total) / reference_length
        weighted_mix = alpha * precision + (1 - alpha) * recall
        fmean = (precision * recall) / weighted_mix
        fragmentation = float(_count_chunks(matches)) / match_total
    except ZeroDivisionError:
        # Any zero divisor in the block above scores as 0.
        return 0.0

    return (1 - gamma * fragmentation**beta) * fmean
|
||||
|
||||
|
||||
def meteor_score(
    references: Iterable[Iterable[str]],
    hypothesis: Iterable[str],
    preprocess: Callable[[str], str] = str.lower,
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
    alpha: float = 0.9,
    beta: float = 3.0,
    gamma: float = 0.5,
) -> float:
    """
    Calculates METEOR score for hypothesis with multiple references as
    described in "Meteor: An Automatic Metric for MT Evaluation with
    High Levels of Correlation with Human Judgments" by Alon Lavie and
    Abhaya Agarwal, in Proceedings of ACL.
    https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    In case of multiple references the best score is chosen. This method
    iterates over single_meteor_score and picks the best pair among all
    the references for a given hypothesis

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party']

    >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
    0.6944

    If no words match during the alignment the method returns the
    score as 0. We can safely return a zero instead of raising a
    division by zero error as no match usually implies a bad translation.

    >>> round(meteor_score([['this', 'is', 'a', 'cat']], ['non', 'matching', 'hypothesis']),4)
    0.0

    :param references: pre-tokenized reference sentences
    :param hypothesis: a pre-tokenized hypothesis sentence
    :param preprocess: preprocessing function (default str.lower)
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :param alpha: parameter for controlling relative weights of precision and recall.
    :param beta: parameter for controlling shape of penalty as a function
                 of fragmentation.
    :param gamma: relative weight assigned to fragmentation penalty.
    :return: The sentence-level METEOR score.
    """
    # Score the hypothesis against every reference; the best one wins.
    per_reference_scores = [
        single_meteor_score(
            reference,
            hypothesis,
            preprocess=preprocess,
            stemmer=stemmer,
            wordnet=wordnet,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
        )
        for reference in references
    ]
    return max(per_reference_scores)
|
||||
41
backend/venv/Lib/site-packages/nltk/translate/metrics.py
Normal file
41
backend/venv/Lib/site-packages/nltk/translate/metrics.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# Natural Language Toolkit: Translation metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Will Zhang <wilzzha@gmail.com>
|
||||
# Guan Gui <ggui@student.unimelb.edu.au>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
def alignment_error_rate(reference, hypothesis, possible=None):
    """
    Return the Alignment Error Rate (AER) of an alignment
    with respect to a "gold standard" reference alignment.
    Return an error rate between 0.0 (perfect alignment) and 1.0 (no
    alignment).

        >>> from nltk.translate import Alignment
        >>> ref = Alignment([(0, 0), (1, 1), (2, 2)])
        >>> test = Alignment([(0, 0), (1, 2), (2, 1)])
        >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS
        0.6666666666666667

    The value computed is
    ``1 - (|hypothesis & reference| + |hypothesis & possible|)
    / (|hypothesis| + |reference|)``.

    :type reference: Alignment
    :param reference: A gold standard alignment (sure alignments)
    :type hypothesis: Alignment
    :param hypothesis: A hypothesis alignment (aka. candidate alignments)
    :type possible: Alignment or None
    :param possible: A gold standard reference of possible alignments
        (defaults to *reference* if None)
    :rtype: float
    """
    if possible is not None:
        assert reference.issubset(possible)  # sanity check
    else:
        possible = reference

    sure_hits = len(hypothesis & reference)
    possible_hits = len(hypothesis & possible)
    total_links = float(len(hypothesis) + len(reference))
    return 1.0 - (sure_hits + possible_hits) / total_links
|
||||
195
backend/venv/Lib/site-packages/nltk/translate/nist_score.py
Normal file
195
backend/venv/Lib/site-packages/nltk/translate/nist_score.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# Natural Language Toolkit: NIST Score
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors:
|
||||
# Contributors:
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""NIST score implementation."""
|
||||
|
||||
import fractions
|
||||
import math
|
||||
from collections import Counter
|
||||
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
def sentence_nist(references, hypothesis, n=5):
    """
    Calculate NIST score from
    George Doddington. 2002. "Automatic evaluation of machine translation quality
    using n-gram co-occurrence statistics." Proceedings of HLT.
    Morgan Kaufmann Publishers Inc. https://dl.acm.org/citation.cfm?id=1289189.1289273

    DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
    score. The official script used by NIST to compute BLEU and NIST score is
    mteval-14.pl. The main differences are:

     - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean.
     - NIST has a different brevity penalty
     - NIST score from mteval-14.pl has a self-contained tokenizer

    Note: The mteval-14.pl includes a smoothing function for BLEU score that is NOT
          used in the NIST score computation.

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    3.3709...

    >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
    1.4619...

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param n: highest n-gram order
    :type n: int
    :return: the result of ``corpus_nist`` on a one-sentence corpus
    """
    # Wrap the single sentence pair as a corpus of size one.
    single_sentence_references = [references]
    single_sentence_hypotheses = [hypothesis]
    return corpus_nist(single_sentence_references, single_sentence_hypotheses, n)
|
||||
|
||||
|
||||
def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
    the hypotheses and their respective references.

    N-gram orders for which the hypotheses contain no n-grams at all (every
    hypothesis is shorter than the order) contribute nothing to the score
    instead of triggering a division by zero.

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    :return: the summed per-order information-weighted precision multiplied
        by the NIST length penalty
    :raises AssertionError: if the number of hypotheses differs from the
        number of reference lists
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # Collect the ngram counts from the reference sentences.
    ngram_freq = Counter()
    total_reference_words = 0
    for (
        references
    ) in list_of_references:  # For each source sent, there's a list of reference sents.
        for reference in references:
            # For each order of ngram, count the ngram occurrences.
            for i in range(1, n + 1):
                ngram_freq.update(ngrams(reference, i))
            total_reference_words += len(reference)

    # Compute the information weights based on the reference sentences.
    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
    information_weights = {}
    for _ngram in ngram_freq:  # w_1 ... w_n
        _mgram = _ngram[:-1]  # w_1 ... w_n-1
        # From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546
        # it's computed as such:
        #     denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq else denominator = total_reference_words
        #     information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2)
        #
        # Mathematically, it's equivalent to our implementation:
        if _mgram and _mgram in ngram_freq:
            numerator = ngram_freq[_mgram]
        else:
            # Unigrams have no shorter prefix; fall back to the total
            # number of reference words.
            numerator = total_reference_words
        information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2)

    # Micro-average.
    nist_precision_numerator_per_ngram = Counter()
    nist_precision_denominator_per_ngram = Counter()
    l_ref, l_sys = 0, 0
    # For each order of ngram.
    for i in range(1, n + 1):
        # Iterate through each hypothesis and their corresponding references.
        for references, hypothesis in zip(list_of_references, hypotheses):
            hyp_len = len(hypothesis)
            # Counter of ngrams in hypothesis; it does not depend on the
            # reference, so build it once per hypothesis and order.
            hyp_ngrams = (
                Counter(ngrams(hypothesis, i)) if hyp_len >= i else Counter()
            )
            _denominator = sum(hyp_ngrams.values())

            # Find reference with the best NIST score.
            nist_score_per_ref = []
            for reference in references:
                _ref_len = len(reference)
                ref_ngrams = (
                    Counter(ngrams(reference, i)) if len(reference) >= i else Counter()
                )
                ngram_overlaps = hyp_ngrams & ref_ngrams
                # Precision part of the score in Eqn 3
                _numerator = sum(
                    information_weights[_ngram] * count
                    for _ngram, count in ngram_overlaps.items()
                )
                _precision = 0 if _denominator == 0 else _numerator / _denominator
                nist_score_per_ref.append(
                    (_precision, _numerator, _denominator, _ref_len)
                )
            # Best reference.
            precision, numerator, denominator, ref_len = max(nist_score_per_ref)
            nist_precision_numerator_per_ngram[i] += numerator
            nist_precision_denominator_per_ngram[i] += denominator
            # Lengths are accumulated once per order on both sides, so the
            # hyp/ref ratio used by the length penalty is unaffected.
            l_ref += ref_len
            l_sys += hyp_len

    # Final NIST micro-average mean aggregation.
    nist_precision = 0
    for i in nist_precision_numerator_per_ngram:
        order_denominator = nist_precision_denominator_per_ngram[i]
        if order_denominator == 0:
            # No hypothesis contains an n-gram of this order (e.g. the
            # hypotheses are shorter than ``i``): nothing to add.
            continue
        nist_precision += nist_precision_numerator_per_ngram[i] / order_denominator
    # Eqn 3 in Doddington(2002)
    return nist_precision * nist_length_penalty(l_ref, l_sys)
|
||||
|
||||
|
||||
def nist_length_penalty(ref_len, hyp_len):
    """
    Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)

        penalty = exp( beta * log( min( len(hyp)/len(ref) , 1.0 ))**2 )

    where,

        `beta` is chosen to make the brevity penalty factor = 0.5 when the
        no. of words in the system output (hyp) is 2/3 of the average
        no. of words in the reference translation (ref)

    The exponential form is applied only when the length ratio lies strictly
    between 0 and 1; any other ratio is clamped to the range [0.0, 1.0].

    The NIST penalty is different from BLEU's such that it minimizes the impact
    on the score of small variations in the length of a translation.
    See Fig. 4 in Doddington (2002)

    :param ref_len: length of the reference
    :param hyp_len: length of the hypothesis
    :return: the length penalty factor
    """
    length_ratio = hyp_len / ref_len
    if not (0 < length_ratio < 1):  # ratio <= 0 or ratio >= 1
        return max(min(length_ratio, 1.0), 0.0)

    # Anchor point: a penalty of 0.5 at a ref/hyp ratio of 1.5.
    anchor_ratio, anchor_score = 1.5, 0.5
    beta = math.log(anchor_score) / math.log(anchor_ratio) ** 2
    return math.exp(beta * math.log(length_ratio) ** 2)
|
||||
193
backend/venv/Lib/site-packages/nltk/translate/phrase_based.py
Normal file
193
backend/venv/Lib/site-packages/nltk/translate/phrase_based.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Phrase Extraction Algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
def extract(
    f_start,
    f_end,
    e_start,
    e_end,
    alignment,
    f_aligned,
    srctext,
    trgtext,
    srclen,
    trglen,
    max_phrase_length,
):
    """
    This function checks for alignment point consistency and extracts
    phrases using the chunk of consistent phrases.

    A phrase pair (e, f) is consistent with an alignment A if and only if:

    (i) No English words in the phrase pair are aligned to words outside it.

           ∀e_i ∈ e, (e_i, f_j) ∈ A ⇒ f_j ∈ f

    (ii) No Foreign words in the phrase pair are aligned to words outside it.

            ∀f_j ∈ f, (e_i, f_j) ∈ A ⇒ e_i ∈ e

    (iii) The phrase pair contains at least one alignment point.

            ∃e_i ∈ e, f_j ∈ f s.t. (e_i, f_j) ∈ A

    :type f_start: int
    :param f_start: Starting index of the possible foreign language phrases
    :type f_end: int
    :param f_end: End index of the possible foreign language phrases
    :type e_start: int
    :param e_start: Starting index of the possible source language phrases
    :type e_end: int
    :param e_end: End index of the possible source language phrases
    :param alignment: iterable of ``(e, f)`` index pairs
    :param f_aligned: container of target-side indices; growing the target
        span stops as soon as the next index is found in it
    :type srctext: list
    :param srctext: The source language tokens, a list of string.
    :type trgtext: list
    :param trgtext: The target language tokens, a list of string.
    :type srclen: int
    :param srclen: The number of tokens in the source language tokens
        (not read by this function).
    :type trglen: int
    :param trglen: The number of tokens in the target language tokens.
    :type max_phrase_length: int
    :param max_phrase_length: caps the initial target end index at
        ``f_start + max_phrase_length - 1``
    :return: a set of ``((e_start, e_end + 1), (fs, fe + 1), src_phrase,
        trg_phrase)`` tuples, or an empty dict ``{}`` when ``f_end`` is
        negative or the alignment is inconsistent with the given spans
    """
    if f_end < 0:  # 0-based indexing.
        return {}

    # Check if alignment points are consistent: a target index inside
    # [f_start, f_end] must not be linked to a source index outside
    # [e_start, e_end].
    for e, f in alignment:
        if f_start <= f <= f_end and not (e_start <= e <= e_end):
            return {}

    # The source side is the same for every phrase pair produced below.
    # Need to +1 in the slice to include the end-point.
    source_span = (e_start, e_end + 1)
    source_phrase = " ".join(srctext[e_start : e_end + 1])
    first_target_end = min(f_end, f_start + max_phrase_length - 1)

    # Add phrase pairs (incl. additional unaligned f)
    phrases = set()
    target_start = f_start
    while True:
        target_end = first_target_end
        while True:
            target_phrase = " ".join(trgtext[target_start : target_end + 1])
            # Include the spans for later ordering.
            phrases.add(
                (
                    source_span,
                    (target_start, target_end + 1),
                    source_phrase,
                    target_phrase,
                )
            )
            # Grow the target span to the right until the next index is in
            # f_aligned or past the end of the sentence.
            target_end += 1
            if target_end in f_aligned or target_end >= trglen:
                break
        # Grow the target span to the left under the same stopping rule.
        target_start -= 1
        if target_start in f_aligned or target_start < 0:
            break
    return phrases
|
||||
|
||||
|
||||
def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
    """
    Phrase extraction algorithm extracts all consistent phrase pairs from
    a word-aligned sentence pair.

    The idea is to loop over all possible source language (e) phrases and find
    the minimal foreign phrase (f) that matches each of them. Matching is done
    by identifying all alignment points for the source phrase and finding the
    shortest foreign phrase that includes all the foreign counterparts for the
    source words.

    In short, a phrase alignment has to
    (a) contain all alignment points for all covered words
    (b) contain at least one alignment point

    >>> srctext = "michael assumes that he will stay in the house"
    >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
    >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
    ... (5,9), (6,7), (7,7), (8,8)]
    >>> phrases = phrase_extraction(srctext, trgtext, alignment)
    >>> for i in sorted(phrases):
    ...    print(i)
    ...
    ((0, 1), (0, 1), 'michael', 'michael')
    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
    ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
    ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
    ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
    ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
    ((1, 2), (1, 4), 'assumes', 'geht davon aus')
    ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
    ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
    ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
    ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
    ((2, 3), (4, 6), 'that', ', dass')
    ((2, 3), (5, 6), 'that', 'dass')
    ((2, 4), (4, 7), 'that he', ', dass er')
    ((2, 4), (5, 7), 'that he', 'dass er')
    ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
    ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
    ((3, 4), (6, 7), 'he', 'er')
    ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
    ((4, 6), (9, 10), 'will stay', 'bleibt')
    ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
    ((6, 8), (7, 8), 'in the', 'im')
    ((6, 9), (7, 9), 'in the house', 'im haus')
    ((8, 9), (8, 9), 'house', 'haus')

    :type srctext: str
    :param srctext: The sentence string from the source language.
    :type trgtext: str
    :param trgtext: The sentence string from the target language.
    :type alignment: list(tuple)
    :param alignment: The word alignment outputs as list of tuples, where
        the first elements of tuples are the source words' indices and
        second elements are the target words' indices. This is also the output
        format of nltk.translate.ibm1
    :type max_phrase_length: int
    :param max_phrase_length: maximal phrase length, if 0 or not specified
        it is set to a length of the longer sentence (srctext or trgtext).
    :rtype: set(tuple)
    :return: A set of tuples, each element of the set is a phrase pair and
        each phrase pair is a tuple made up of (i) its source location,
        (ii) its target location, (iii) the source phrase and (iv) the target
        phrase. Locations are ``(start, end)`` word spans with an exclusive
        end. The set holds all the phrase pairs extracted from the word
        alignments; it is empty when the source sentence has no tokens.
    """

    srctext = srctext.split()  # e
    trgtext = trgtext.split()  # f
    srclen = len(srctext)  # len(e)
    trglen = len(trgtext)  # len(f)
    # Index of the target positions that carry at least one alignment point.
    # ``extract`` only performs membership tests on it inside its loops, so a
    # set gives constant-time lookups instead of scanning a list every time.
    f_aligned = {j for _, j in alignment}
    max_phrase_length = max_phrase_length or max(srclen, trglen)

    # set of phrase pairs BP
    bp = set()

    for e_start in range(srclen):
        max_idx = min(srclen, e_start + max_phrase_length)
        for e_end in range(e_start, max_idx):
            # // find the minimally matching foreign phrase
            # (f start , f end ) = ( length(f), 0 )
            # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
            f_start, f_end = trglen - 1, -1  # 0-based indexing

            for e, f in alignment:
                if e_start <= e <= e_end:
                    f_start = min(f, f_start)
                    f_end = max(f, f_end)
            # add extract (f start , f end , e start , e end ) to set BP
            phrases = extract(
                f_start,
                f_end,
                e_start,
                e_end,
                alignment,
                f_aligned,
                srctext,
                trgtext,
                srclen,
                trglen,
                max_phrase_length,
            )
            # ``extract`` returns an empty container when the candidate span
            # is inconsistent; only non-empty results are merged.
            if phrases:
                bp.update(phrases)
    return bp
|
||||
330
backend/venv/Lib/site-packages/nltk/translate/ribes_score.py
Normal file
330
backend/venv/Lib/site-packages/nltk/translate/ribes_score.py
Normal file
@@ -0,0 +1,330 @@
|
||||
# Natural Language Toolkit: RIBES Score
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
|
||||
# Mark Byers, ekhumoro, P. Ortiz
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
""" RIBES score implementation """
|
||||
|
||||
import math
|
||||
from itertools import islice
|
||||
|
||||
from nltk.util import choose, ngrams
|
||||
|
||||
|
||||
def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
    """
    The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
    Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
    Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
    Distant Language Pairs". In Proceedings of EMNLP.
    https://www.aclweb.org/anthology/D/D10/D10-1092.pdf

    The generic RIBES scores used in shared task, e.g. Workshop for
    Asian Translation (WAT) uses the following RIBES calculations:

        RIBES = kendall_tau * (p1**alpha) * (bp**beta)

    where ``p1`` is the unigram precision and ``bp`` the brevity penalty.

    Please note that this re-implementation differs from the official
    RIBES implementation and though it emulates the results as described
    in the original paper, there are further optimizations implemented
    in the official RIBES script.

    Users are encouraged to use the official RIBES script instead of this
    implementation when evaluating your machine translation system. Refer
    to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The best ribes score from one of the references. It is -1.0
        when ``references`` is empty, and 0.0 when ``hypothesis`` is empty
        and at least one reference is given.
    :rtype: float
    """
    hyp_len = len(hypothesis)
    best_ribes = -1.0
    # Calculates RIBES for each reference and returns the best score.
    for reference in references:
        if hyp_len == 0:
            # An empty hypothesis aligns no words: the normalized Kendall's
            # tau of an empty worder is 0, so the score is 0. Short-circuit
            # here, otherwise the divisions below raise ZeroDivisionError.
            _ribes = 0.0
        else:
            # Collects the *worder* from the ranked correlation alignments.
            worder = word_rank_alignment(reference, hypothesis)
            nkt = kendall_tau(worder)

            # Calculates the brevity penalty
            bp = min(1.0, math.exp(1.0 - len(reference) / hyp_len))

            # Calculates the unigram precision, *p1*
            p1 = len(worder) / hyp_len

            _ribes = nkt * (p1**alpha) * (bp**beta)

        if _ribes > best_ribes:  # Keeps the best score.
            best_ribes = _ribes

    return best_ribes
|
||||
|
||||
|
||||
def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
    """
    This function "calculates RIBES for a system output (hypothesis) with
    multiple references, and returns "best" score among multi-references and
    individual scores. The scores are corpus-wise, i.e., averaged by the number
    of sentences." (c.f. RIBES version 1.03.1 code).

    Different from BLEU's micro-average precision, RIBES calculates the
    macro-average precision by averaging the best RIBES score for each pair of
    hypothesis and its corresponding references

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> round(corpus_ribes(list_of_references, hypotheses),4)
    0.3597

    :param list_of_references: a corpus of lists of reference sentences,
        w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The sum of the best per-sentence RIBES scores divided by the
        number of hypotheses; 0.0 when ``hypotheses`` is empty.
    :rtype: float
    """
    num_hypotheses = len(hypotheses)
    if num_hypotheses == 0:
        # Nothing to average; avoid dividing by zero below.
        return 0.0

    corpus_best_ribes = 0.0
    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
    return corpus_best_ribes / num_hypotheses
|
||||
|
||||
|
||||
def position_of_ngram(ngram, sentence):
    """
    Locate the first occurrence of ``ngram`` among the n-grams of
    ``sentence`` and return its index, i.e. the index of the n-gram's
    first token.

    The search walks the n-grams of ``sentence`` that have the same length
    as ``ngram`` from left to right and stops at the first one that
    compares equal. When nothing matches, ``None`` is returned.

    :param ngram: The ngram that needs to be searched
    :type ngram: tuple
    :param sentence: The list of tokens to search from.
    :type sentence: list(str)
    :return: The 0-based position of the first match, or None.
    """
    # Lazily enumerate the candidate windows; only the first hit is consumed.
    matching_positions = (
        position
        for position, candidate in enumerate(ngrams(sentence, len(ngram)))
        if ngram == candidate
    )
    return next(matching_positions, None)
|
||||
|
||||
|
||||
def word_rank_alignment(reference, hypothesis, character_based=False):
    """
    This is the word rank alignment algorithm described in the paper to produce
    the *worder* list, i.e. a list of word indices of the hypothesis word orders
    w.r.t. the list of reference words.

    Below is (H0, R0) example from the Isozaki et al. 2010 paper,
    note the examples are indexed from 1 but the results here are indexed from 0:

        >>> ref = str('he was interested in world history because he '
        ... 'read the book').split()
        >>> hyp = str('he read the book because he was interested in world '
        ... 'history').split()
        >>> word_rank_alignment(ref, hyp)
        [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]

    The (H1, R1) example from the paper, note the 0th index:

        >>> ref = 'John hit Bob yesterday'.split()
        >>> hyp = 'Bob hit John yesterday'.split()
        >>> word_rank_alignment(ref, hyp)
        [2, 1, 0, 3]

    Here is the (H2, R2) example from the paper, note the 0th index here too:

        >>> ref = 'the boy read the book'.split()
        >>> hyp = 'the book was read by the boy'.split()
        >>> word_rank_alignment(ref, hyp)
        [3, 4, 2, 0, 1]

    Hypothesis words that are absent from the reference, or that cannot be
    disambiguated by any context window, contribute nothing to the result.

    :param reference: a reference sentence
    :type reference: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param character_based: accepted for interface compatibility; this
        implementation never reads it.
    :type character_based: bool
    :return: the *worder* list of reference positions, in hypothesis order
    :rtype: list(int)
    """
    # Imported locally so this function is self-contained.
    from collections import Counter

    worder = []
    hyp_len = len(hypothesis)
    # Occurrence counts of every ngram (up to the reference length) in the
    # reference and in the hypothesis. They are queried for each context
    # window below; counting once up front avoids rescanning the whole
    # ngram list on every query.
    ref_ngram_counts = Counter()
    hyp_ngram_counts = Counter()
    for n in range(1, len(reference) + 1):
        ref_ngram_counts.update(ngrams(reference, n))
        hyp_ngram_counts.update(ngrams(hypothesis, n))

    for i, h_word in enumerate(hypothesis):
        # If word is not in the reference, continue.
        if h_word not in reference:
            continue
        # If we can determine one-to-one word correspondence for unigrams that
        # only appear once in both the reference and hypothesis.
        if hypothesis.count(h_word) == reference.count(h_word) == 1:
            worder.append(reference.index(h_word))
            continue

        # Otherwise grow a context window around the word until an ngram
        # that is unique in both sentences pins down its position.
        max_window_size = max(i, hyp_len - i + 1)
        for window in range(1, max_window_size):
            if i + window < hyp_len:  # If searching the right context is possible.
                # Retrieve the right context window.
                right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
                # If ngram appears only once in both ref and hyp.
                if (
                    ref_ngram_counts[right_context_ngram]
                    == hyp_ngram_counts[right_context_ngram]
                    == 1
                ):
                    # Find the position of ngram that matched the reference.
                    pos = position_of_ngram(right_context_ngram, reference)
                    worder.append(pos)  # Add the positions of the ngram.
                    break
            if window <= i:  # If searching the left context is possible.
                # Retrieve the left context window.
                left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
                if (
                    ref_ngram_counts[left_context_ngram]
                    == hyp_ngram_counts[left_context_ngram]
                    == 1
                ):
                    # Find the position of ngram that matched the reference.
                    pos = position_of_ngram(left_context_ngram, reference)
                    # The word is the last token of the left-context ngram.
                    worder.append(pos + len(left_context_ngram) - 1)
                    break
    return worder
|
||||
|
||||
|
||||
def find_increasing_sequences(worder):
    """
    Yield every maximal run of consecutive values (each item exactly one
    greater than the previous) found in the *worder* list, as a tuple.
    Runs of a single item are not reported.

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> list(find_increasing_sequences(worder))
        [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    """
    current_run = []
    for position in worder:
        # Extend the run while the values keep stepping up by one.
        if current_run and current_run[-1] + 1 == position:
            current_run.append(position)
            continue
        # The run is broken: report it if it is a real sequence, then restart.
        if len(current_run) > 1:
            yield tuple(current_run)
        current_run = [position]
    # Flush the run that was still open when the input ended.
    if len(current_run) > 1:
        yield tuple(current_run)
|
||||
|
||||
|
||||
def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs -1

    Note that the no. of increasing pairs can be discontinuous in the *worder*
    list and each increasing sequence can be tabulated as choose(len(seq), 2)
    no. of increasing pairs, e.g.

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> number_possible_pairs = choose(len(worder), 2)
        >>> round(kendall_tau(worder, normalize=False),3)
        -0.236
        >>> round(kendall_tau(worder),3)
        0.382

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    if worder_len >= 2:
        # Every run of consecutive indices contributes choose(len(run), 2)
        # increasing pairs.
        num_increasing_pairs = sum(
            choose(len(run), 2) for run in find_increasing_sequences(worder)
        )
        # Kendall's Tau computation over all possible pairs.
        tau = 2 * num_increasing_pairs / choose(worder_len, 2) - 1
    else:
        # Fewer than two items means there are no pairs at all, so the
        # division above would fail; fall back to the lowest possible score.
        tau = -1

    # Normalized output lies in [0.0, 1.0]; raw output lies in [-1.0, +1.0].
    return (tau + 1) / 2 if normalize else tau
|
||||
|
||||
|
||||
def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignment from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    Given that d is the difference between each entry of the *worder* list
    and its own position in that list.

    Using the (H0,R0) and (H5, R5) example from the paper

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> round(spearman_rho(worder, normalize=False), 3)
        -0.591
        >>> round(spearman_rho(worder), 3)
        0.205

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Spearman's Rho correlation coefficient; the lowest possible
        score when *worder* has fewer than two items.
    :rtype: float
    """
    worder_len = len(worder)
    # With worder_len < 2, `choose(worder_len + 1, 3)` will be 0.
    # As we divide by this, it would give a ZeroDivisionError.
    # To avoid this, return the lowest possible score, exactly as
    # kendall_tau() does for the same degenerate input.
    if worder_len < 2:
        rho = -1
    else:
        sum_d_square = sum((wi - i) ** 2 for i, wi in enumerate(worder))
        rho = 1 - sum_d_square / choose(worder_len + 1, 3)

    if normalize:  # If normalized, the rho output falls between 0.0 to 1.0
        return (rho + 1) / 2
    else:  # Otherwise, the rho outputs falls between -1.0 to +1.0
        return rho
|
||||
515
backend/venv/Lib/site-packages/nltk/translate/stack_decoder.py
Normal file
515
backend/venv/Lib/site-packages/nltk/translate/stack_decoder.py
Normal file
@@ -0,0 +1,515 @@
|
||||
# Natural Language Toolkit: Stack decoder
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A decoder that uses stacks to implement phrase-based translation.
|
||||
|
||||
In phrase-based translation, the source sentence is segmented into
|
||||
phrases of one or more words, and translations for those phrases are
|
||||
used to build the target sentence.
|
||||
|
||||
Hypothesis data structures are used to keep track of the source words
|
||||
translated so far and the partial output. A hypothesis can be expanded
|
||||
by selecting an untranslated phrase, looking up its translation in a
|
||||
phrase table, and appending that translation to the partial output.
|
||||
Translation is complete when a hypothesis covers all source words.
|
||||
|
||||
The search space is huge because the source sentence can be segmented
|
||||
in different ways, the source phrases can be selected in any order,
|
||||
and there could be multiple translations for the same source phrase in
|
||||
the phrase table. To make decoding tractable, stacks are used to limit
|
||||
the number of candidate hypotheses by doing histogram and/or threshold
|
||||
pruning.
|
||||
|
||||
Hypotheses with the same number of words translated are placed in the
|
||||
same stack. In histogram pruning, each stack has a size limit, and
|
||||
the hypothesis with the lowest score is removed when the stack is full.
|
||||
In threshold pruning, hypotheses that score below a certain threshold
|
||||
of the best hypothesis in that stack are removed.
|
||||
|
||||
Hypothesis scoring can include various factors such as phrase
|
||||
translation probability, language model probability, length of
|
||||
translation, cost of remaining words to be translated, and so on.
|
||||
|
||||
|
||||
References:
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from math import log
|
||||
|
||||
|
||||
class StackDecoder:
    """
    Phrase-based stack decoder for machine translation

    >>> from nltk.translate import PhraseTable
    >>> phrase_table = PhraseTable()
    >>> phrase_table.add(('niemand',), ('nobody',), log(0.8))
    >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
    >>> phrase_table.add(('erwartet',), ('expects',), log(0.8))
    >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2))
    >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
    >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8))
    >>> phrase_table.add(('!',), ('!',), log(0.8))

    >>> #  nltk.model should be used here once it is implemented
    >>> from collections import defaultdict
    >>> language_prob = defaultdict(lambda: -999.0)
    >>> language_prob[('nobody',)] = log(0.5)
    >>> language_prob[('expects',)] = log(0.4)
    >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
    >>> language_prob[('!',)] = log(0.1)
    >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})()

    >>> stack_decoder = StackDecoder(phrase_table, language_model)

    >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!'])
    ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']

    """

    def __init__(self, phrase_table, language_model):
        """
        :param phrase_table: Table of translations for source language
            phrases and the log probabilities for those translations.
        :type phrase_table: PhraseTable

        :param language_model: Target language model. Must define a
            ``probability_change`` method that calculates the change in
            log probability of a sentence, if a given string is appended
            to it. A ``probability`` method is called as well when future
            scores are computed.
            This interface is experimental and will likely be replaced
            with nltk.model once it is implemented.
        :type language_model: object
        """
        self.phrase_table = phrase_table
        self.language_model = language_model

        self.word_penalty = 0.0
        """
        float: Influences the translation length exponentially.
            If positive, shorter translations are preferred.
            If negative, longer translations are preferred.
            If zero, no penalty is applied.
        """

        self.beam_threshold = 0.0
        """
        float: Hypotheses that score below this factor of the best
            hypothesis in a stack are dropped from consideration.
            Value between 0.0 and 1.0.
        """

        self.stack_size = 100
        """
        int: Maximum number of hypotheses to consider in a stack.
            Higher values increase the likelihood of a good translation,
            but increases processing time.
        """

        self.__distortion_factor = 0.5
        self.__compute_log_distortion()

    @property
    def distortion_factor(self):
        """
        float: Amount of reordering of source phrases.
            Lower values favour monotone translation, suitable when
            word order is similar for both source and target languages.
            Value between 0.0 and 1.0. Default 0.5.
        """
        return self.__distortion_factor

    @distortion_factor.setter
    def distortion_factor(self, d):
        self.__distortion_factor = d
        # Keep the cached logarithm in step with the new factor.
        self.__compute_log_distortion()

    def __compute_log_distortion(self):
        # The logarithm of the distortion factor is needed for every
        # hypothesis that gets scored, so it is computed once and cached.
        factor = self.__distortion_factor
        # log(0) is undefined; 1e-9 is almost zero
        self.__log_distortion_factor = log(1e-9) if factor == 0.0 else log(factor)

    def translate(self, src_sentence):
        """
        :param src_sentence: Sentence to be translated
        :type src_sentence: list(str)

        :return: Translated sentence; an empty list (with a warning) when
            no hypothesis covers every source word
        :rtype: list(str)
        """
        source = tuple(src_sentence)  # prevent accidental modification
        num_words = len(source)

        # One stack per count of translated source words, 0..num_words.
        stacks = []
        for _ in range(num_words + 1):
            stacks.append(_Stack(self.stack_size, self.beam_threshold))
        stacks[0].push(_Hypothesis())

        phrase_spans = self.find_all_src_phrases(source)
        future_scores = self.compute_future_scores(source)
        for stack in stacks:
            for hypothesis in stack:
                for span in StackDecoder.valid_phrases(phrase_spans, hypothesis):
                    options = self.phrase_table.translations_for(
                        source[span[0] : span[1]]
                    )
                    for option in options:
                        expanded = _Hypothesis(
                            raw_score=self.expansion_score(hypothesis, option, span),
                            src_phrase_span=span,
                            trg_phrase=option.trg_phrase,
                            previous=hypothesis,
                        )
                        expanded.future_score = self.future_score(
                            expanded, future_scores, num_words
                        )
                        # File the expansion under its translated word count.
                        stacks[expanded.total_translated_words()].push(expanded)

        final_stack = stacks[num_words]
        if not final_stack:
            warnings.warn(
                "Unable to translate all words. "
                "The source sentence contains words not in "
                "the phrase table"
            )
            # Instead of returning empty output, perhaps a partial
            # translation could be returned
            return []

        return final_stack.best().translation_so_far()

    def find_all_src_phrases(self, src_sentence):
        """
        Finds all subsequences in src_sentence that have a phrase
        translation in the translation table

        :type src_sentence: tuple(str)

        :return: Subsequences that have a phrase translation,
            represented as a table of lists of end positions.
            For example, if result[2] is [5, 6, 9], then there are
            three phrases starting from position 2 in ``src_sentence``,
            ending at positions 5, 6, and 9 exclusive. The list of
            ending positions are in ascending order.
        :rtype: list(list(int))
        """
        sentence_length = len(src_sentence)
        # Entry ``start`` lists each exclusive end whose slice is in the table.
        return [
            [
                end
                for end in range(start + 1, sentence_length + 1)
                if src_sentence[start:end] in self.phrase_table
            ]
            for start in range(sentence_length)
        ]

    def compute_future_scores(self, src_sentence):
        """
        Determines the approximate scores for translating every
        subsequence in ``src_sentence``

        Future scores can be used a look-ahead to determine the
        difficulty of translating the remaining parts of a src_sentence.

        :type src_sentence: tuple(str)

        :return: Scores of subsequences referenced by their start and
            end positions. For example, result[2][5] is the score of the
            subsequence covering positions 2, 3, and 4.
        :rtype: dict(int: (dict(int): float))
        """
        scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        sentence_length = len(src_sentence)
        # Shorter subsequences are scored first so they can be combined below.
        for seq_length in range(1, sentence_length + 1):
            for start in range(sentence_length - seq_length + 1):
                end = start + seq_length
                phrase = src_sentence[start:end]
                if phrase in self.phrase_table:
                    # pick best (first) translation
                    best_option = self.phrase_table.translations_for(phrase)[0]
                    # Warning: API of language_model is subject to change
                    scores[start][end] = (
                        best_option.log_prob + self.language_model.probability(phrase)
                    )

                # check if a better score can be obtained by combining
                # two child subsequences
                for mid in range(start + 1, end):
                    combined_score = scores[start][mid] + scores[mid][end]
                    if combined_score > scores[start][end]:
                        scores[start][end] = combined_score
        return scores

    def future_score(self, hypothesis, future_score_table, sentence_length):
        """
        Determines the approximate score for translating the
        untranslated words in ``hypothesis``
        """
        remaining_spans = hypothesis.untranslated_spans(sentence_length)
        return sum(
            (future_score_table[span[0]][span[1]] for span in remaining_spans), 0.0
        )

    def expansion_score(self, hypothesis, translation_option, src_phrase_span):
        """
        Calculate the score of expanding ``hypothesis`` with
        ``translation_option``

        :param hypothesis: Hypothesis being expanded
        :type hypothesis: _Hypothesis

        :param translation_option: Information about the proposed expansion
        :type translation_option: PhraseTableEntry

        :param src_phrase_span: Word position span of the source phrase
        :type src_phrase_span: tuple(int, int)
        """
        # The terms are accumulated left to right: previous score, translation
        # log probability, language model change, distortion, word penalty.
        # The API of language_model is subject to change; it could accept
        # a string, a list of words, and/or some other type
        return (
            hypothesis.raw_score
            + translation_option.log_prob
            + self.language_model.probability_change(
                hypothesis, translation_option.trg_phrase
            )
            + self.distortion_score(hypothesis, src_phrase_span)
            - self.word_penalty * len(translation_option.trg_phrase)
        )

    def distortion_score(self, hypothesis, next_src_phrase_span):
        previous_span = hypothesis.src_phrase_span
        if not previous_span:
            # No phrase has been translated yet, so there is no jump to score.
            return 0.0
        # Distance between the end of the last phrase and the start of the next.
        jump = next_src_phrase_span[0] - previous_span[1]
        return abs(jump) * self.__log_distortion_factor

    @staticmethod
    def valid_phrases(all_phrases_from, hypothesis):
        """
        Extract phrases from ``all_phrases_from`` that contains words
        that have not been translated by ``hypothesis``

        :param all_phrases_from: Phrases represented by their spans, in
            the same format as the return value of
            ``find_all_src_phrases``
        :type all_phrases_from: list(list(int))

        :type hypothesis: _Hypothesis

        :return: A list of phrases, represented by their spans, that
            cover untranslated positions.
        :rtype: list(tuple(int, int))
        """
        usable_spans = []
        for gap in hypothesis.untranslated_spans(len(all_phrases_from)):
            gap_end = gap[1]
            for start in range(gap[0], gap_end):
                for phrase_end in all_phrases_from[start]:
                    if phrase_end <= gap_end:
                        usable_spans.append((start, phrase_end))
                    else:
                        # Subsequent elements in all_phrases_from[start]
                        # will also be > gap_end, since the
                        # elements are in ascending order
                        break
        return usable_spans
|
||||
|
||||
|
||||
class _Hypothesis:
    """
    Partial solution to a translation.

    Records the word positions of the phrase being translated, its
    translation, raw score, and the cost of the untranslated parts of
    the sentence. When the next phrase is selected to build upon the
    partial solution, a new _Hypothesis object is created, with a back
    pointer to the previous hypothesis.

    To find out which words have been translated so far, look at the
    ``src_phrase_span`` in the hypothesis chain. Similarly, the
    translation output can be found by traversing up the chain.
    """

    def __init__(
        self,
        raw_score=0.0,
        src_phrase_span=(),
        trg_phrase=(),
        previous=None,
        future_score=0.0,
    ):
        """
        :param raw_score: Likelihood of hypothesis so far.
            Higher is better. Does not account for untranslated words.
        :type raw_score: float

        :param src_phrase_span: Span of word positions covered by the
            source phrase in this hypothesis expansion. For example,
            (2, 5) means that the phrase is from the second word up to,
            but not including the fifth word in the source sentence.
        :type src_phrase_span: tuple(int, int)

        :param trg_phrase: Translation of the source phrase in this
            hypothesis expansion
        :type trg_phrase: tuple(str)

        :param previous: Previous hypothesis before expansion to this one
        :type previous: _Hypothesis

        :param future_score: Approximate score for translating the
            remaining words not covered by this hypothesis. Higher means
            that the remaining words are easier to translate.
        :type future_score: float
        """
        self.raw_score = raw_score
        self.src_phrase_span = src_phrase_span
        self.trg_phrase = trg_phrase
        self.previous = previous
        self.future_score = future_score

    def score(self):
        """
        Overall score of hypothesis after accounting for local and
        global features

        :return: ``raw_score`` plus ``future_score``
        """
        return self.raw_score + self.future_score

    def untranslated_spans(self, sentence_length):
        """
        Starting from each untranslated word, find the longest
        continuous span of untranslated positions

        :param sentence_length: Length of source sentence being
            translated by the hypothesis
        :type sentence_length: int

        :return: Half-open ``(start, end)`` spans in ascending order
        :rtype: list(tuple(int, int))
        """
        translated_positions = self.translated_positions()
        translated_positions.sort()
        translated_positions.append(sentence_length)  # add sentinel position

        untranslated_spans = []
        start = 0
        # each untranslated span must end in one of the translated_positions
        for end in translated_positions:
            if start < end:
                untranslated_spans.append((start, end))
            # Resume the search just past the translated position.
            start = end + 1

        return untranslated_spans

    def translated_positions(self):
        """
        List of positions in the source sentence of words already
        translated. The list is not sorted.

        The hypothesis whose ``previous`` is None terminates the walk
        and contributes no positions.

        :rtype: list(int)
        """
        translated_positions = []
        current_hypothesis = self
        while current_hypothesis.previous is not None:
            translated_span = current_hypothesis.src_phrase_span
            translated_positions.extend(range(translated_span[0], translated_span[1]))
            current_hypothesis = current_hypothesis.previous
        return translated_positions

    def total_translated_words(self):
        """
        :return: Number of source positions covered by the hypothesis chain
        :rtype: int
        """
        return len(self.translated_positions())

    def translation_so_far(self):
        """
        :return: Target words of every hypothesis in the chain, ordered
            from the earliest expansion to this one
        :rtype: list(str)
        """
        translation = []
        self.__build_translation(self, translation)
        return translation

    def __build_translation(self, hypothesis, output):
        """
        Append to ``output`` the target phrases of the chain ending at
        ``hypothesis``, earliest expansion first.

        The chain is walked with a loop rather than recursion so that a
        chain longer than the interpreter's recursion limit does not
        raise ``RecursionError``.
        """
        phrases = []
        node = hypothesis
        while node.previous is not None:
            phrases.append(node.trg_phrase)
            node = node.previous
        # Phrases were collected newest-first; emit them oldest-first.
        for phrase in reversed(phrases):
            output.extend(phrase)
|
||||
|
||||
|
||||
class _Stack:
    """
    Collection of _Hypothesis objects, kept ordered from the highest
    score to the lowest
    """

    def __init__(self, max_size=100, beam_threshold=0.0):
        """
        :param max_size: Largest number of hypotheses kept after a push
        :type max_size: int

        :param beam_threshold: Hypotheses that score less than this
            factor of the best hypothesis are discarded from the stack.
            Value must be between 0.0 and 1.0.
        :type beam_threshold: float
        """
        self.max_size = max_size
        self.items = []
        # Scores are log values, so the factor is stored as a log too;
        # a factor of zero means nothing is ever pruned by threshold.
        self.__log_beam_threshold = (
            float("-inf") if beam_threshold == 0.0 else log(beam_threshold)
        )

    def push(self, hypothesis):
        """
        Add ``hypothesis`` to the stack.
        Removes lowest scoring hypothesis if the stack is full.
        After insertion, hypotheses that score less than
        ``beam_threshold`` times the score of the best hypothesis
        are removed.
        """
        ranked = self.items
        ranked.append(hypothesis)
        ranked.sort(key=lambda entry: entry.score(), reverse=True)
        while len(ranked) > self.max_size:
            ranked.pop()
        self.threshold_prune()

    def threshold_prune(self):
        """
        Drop hypotheses from the low-scoring end of the stack until the
        last remaining one reaches the beam threshold.
        """
        if not self.items:
            return
        # log(score * beam_threshold) = log(score) + log(beam_threshold)
        cutoff = self.items[0].score() + self.__log_beam_threshold
        while self.items and self.items[-1].score() < cutoff:
            self.items.pop()

    def best(self):
        """
        :return: Hypothesis with the highest score in the stack, or
            None when the stack is empty
        :rtype: _Hypothesis
        """
        return self.items[0] if self.items else None

    def __iter__(self):
        return iter(self.items)

    def __contains__(self, hypothesis):
        return hypothesis in self.items

    def __bool__(self):
        return bool(self.items)

    __nonzero__ = __bool__
|
||||
Reference in New Issue
Block a user