Initial commit
backend/venv/Lib/site-packages/nltk/tokenize/__init__.py (new file, 145 lines)
@@ -0,0 +1,145 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
"""

import functools
import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTokenizer
from nltk.tokenize.regexp import (
    BlanklineTokenizer,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    LineTokenizer,
    SpaceTokenizer,
    TabTokenizer,
    line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


@functools.lru_cache
def _get_punkt_tokenizer(language="english"):
    """
    A constructor for the PunktTokenizer that utilizes
    an LRU cache for performance.

    :param language: the model name in the Punkt corpus
    :type language: str
    """
    return PunktTokenizer(language)


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = _get_punkt_tokenizer(language)
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]
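Reviewer note: a minimal usage sketch of the API added above (illustrative only, not part of the committed file). It shows the `preserve_line` flag: by default `word_tokenize()` runs `sent_tokenize()` first, so only true sentence-final periods are split off; with `preserve_line=True` the whole string is treated as one line. It assumes the Punkt models are already installed, e.g. via `nltk.download("punkt_tab")`.

from nltk.tokenize import word_tokenize

text = "Dr. Smith arrived. He was late."
# Default path: sentence-split first, so each sentence's final '.' becomes
# its own token while the abbreviation period in "Dr." stays attached.
print(word_tokenize(text))
# Skip sentence splitting: only the period at the very end of the string is
# split off, so "arrived." survives as a single token.
print(word_tokenize(text, preserve_line=True))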
backend/venv/Lib/site-packages/nltk/tokenize/api.py (new file, 83 lines)
@@ -0,0 +1,83 @@
# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Tokenizer Interface
"""

from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize


class TokenizerI(ABC):
    """
    A processing interface for tokenizing a string.
    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
    """

    @abstractmethod
    def tokenize(self, s: str) -> List[str]:
        """
        Return a tokenized copy of *s*.

        :rtype: List[str]
        """
        if overridden(self.tokenize_sents):
            return self.tokenize_sents([s])[0]

    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
        """
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        :rtype: Iterator[Tuple[int, int]]
        """
        raise NotImplementedError()

    def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
        """
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :rtype: List[List[str]]
        """
        return [self.tokenize(s) for s in strings]

    def span_tokenize_sents(
        self, strings: List[str]
    ) -> Iterator[List[Tuple[int, int]]]:
        """
        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:

            return [self.span_tokenize(s) for s in strings]

        :yield: List[Tuple[int, int]]
        """
        for s in strings:
            yield list(self.span_tokenize(s))


class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    """

    @property
    @abstractmethod
    def _string(self):
        raise NotImplementedError

    def tokenize(self, s):
        return s.split(self._string)

    def span_tokenize(self, s):
        yield from string_span_tokenize(s, self._string)
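Reviewer note: a sketch (not part of the diff) of how the ``TokenizerI`` contract above is meant to be consumed. A subclass only needs ``tokenize()``; ``tokenize_sents()`` comes for free. ``CommaTokenizer`` is a hypothetical example class, not an NLTK API.

from typing import List

from nltk.tokenize.api import TokenizerI


class CommaTokenizer(TokenizerI):
    """Hypothetical example: split on commas."""

    def tokenize(self, s: str) -> List[str]:
        return s.split(",")


# tokenize_sents() is inherited from TokenizerI and maps tokenize() over a list:
print(CommaTokenizer().tokenize_sents(["a,b", "c,d,e"]))
# [['a', 'b'], ['c', 'd', 'e']]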
backend/venv/Lib/site-packages/nltk/tokenize/casual.py (new file, 458 lines)
@@ -0,0 +1,458 @@
#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
#         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#


"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression
   strings.

2. The REGEXPS strings are put, in order, into a compiled
   regular expression object called WORD_RE, under the TweetTokenizer
   class.

3. The tokenization is done by WORD_RE.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   TweetTokenizer.

4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles of text used in the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
"""


######################################################################

import html
from typing import List

import regex  # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"""             # Capture 1: entire matched URL
  (?:
  https?:               # URL protocol and colon
    (?:
      /{1,3}            # 1-3 slashes
      |                 #   or
      [a-z0-9%]         # Single letter or digit or '%'
                        # (Trying not to match e.g. "URI::Escape")
    )
    |                   #   or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                   # One or more:
    [^\s()<>{}\[\]]+    # Run of non-space, non-()<>{}[]
    |                   #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)         # balanced parens, non-recursive: (...)
  )+
  (?:                   # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)         # balanced parens, non-recursive: (...)
    |                   #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]  # not a space or one of these punct chars
  )
  |                     # OR, the following to match naked domains:
  (?:
    (?<!@)              # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)               # not succeeded by a @,
                        # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# emoji flag sequence
# https://en.wikipedia.org/wiki/Regional_indicator_symbol
# For regex simplicity, include all possible enclosed letter pairs,
# not the ISO subset of two-letter regional indicator symbols.
# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
# Future regional flag support may be handled with the regex for
# U+1F3F4 🏴 followed by emoji tag sequences:
# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
FLAGS = r"""
    (?:
      [\U0001F1E6-\U0001F1FF]{2}  # all enclosed letter pairs
      |
      # English flag
      \U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
      |
      # Scottish flag
      \U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
      |
      # For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
      \U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
    )
    """

# Regex for recognizing phone numbers:
PHONE_REGEX = r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Zero-Width-Joiner and Skin tone modifier emojis
    """.(?:
        [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
        |
        [\U0001F3FB-\U0001F3FF]
    )""",
    # flags
    FLAGS,
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

# Take the main components and add a phone regex as the second parameter
REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])

######################################################################
# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
# the core tokenizing regexes. They are compiled lazily.

# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")

# For stripping away handles from a tweet:
HANDLES_RE = regex.compile(
    r"(?<![A-Za-z0-9_!@#\$%&*])@"
    r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
)


######################################################################
# Functions for converting html entities
######################################################################


def _str_to_unicode(text, encoding=None, errors="strict"):
    if encoding is None:
        encoding = "utf-8"
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.\
    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted are\
    removed. Otherwise, entities that can't be converted are kept "as
    is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            number = html.entities.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################


class TweetTokenizer(TokenizerI):
    r"""
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
        '<--']

    Examples using the `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    """

    # Values used to lazily compile WORD_RE and PHONE_WORD_RE,
    # which are the core tokenizing regexes.
    _WORD_RE = None
    _PHONE_WORD_RE = None

    ######################################################################

    def __init__(
        self,
        preserve_case=True,
        reduce_len=False,
        strip_handles=False,
        match_phone_numbers=True,
    ):
        """
        Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.

        :param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
            of text used in the `tokenize` method. Defaults to True.
        :type preserve_case: bool
        :param reduce_len: Flag indicating whether to replace repeated character sequences
            of length 3 or greater with sequences of length 3. Defaults to False.
        :type reduce_len: bool
        :param strip_handles: Flag indicating whether to remove Twitter handles of text used
            in the `tokenize` method. Defaults to False.
        :type strip_handles: bool
        :param match_phone_numbers: Flag indicating whether the `tokenize` method should look
            for phone numbers. Defaults to True.
        :type match_phone_numbers: bool
        """
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles
        self.match_phone_numbers = match_phone_numbers

    def tokenize(self, text: str) -> List[str]:
        """Tokenize the input text.

        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; joining this list returns\
        the original string if `preserve_case=False`.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Recognise phone numbers during tokenization
        if self.match_phone_numbers:
            words = self.PHONE_WORD_RE.findall(safe_text)
        else:
            words = self.WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words

    @property
    def WORD_RE(self) -> "regex.Pattern":
        """Core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._WORD_RE:
            type(self)._WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._WORD_RE

    @property
    def PHONE_WORD_RE(self) -> "regex.Pattern":
        """Secondary core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._PHONE_WORD_RE:
            type(self)._PHONE_WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS_PHONE)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._PHONE_WORD_RE


######################################################################
# Normalization Functions
######################################################################


def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
    return HANDLES_RE.sub(" ", text)


######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(
    text,
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
    match_phone_numbers=True,
):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case,
        reduce_len=reduce_len,
        strip_handles=strip_handles,
        match_phone_numbers=match_phone_numbers,
    ).tokenize(text)


###############################################################################
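Reviewer note: an illustrative sketch (not part of the committed file) exercising the four constructor flags documented in the module docstring above, on one tweet-like string.

from nltk.tokenize.casual import TweetTokenizer

tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
print(tknzr.tokenize("@someuser SOOOOO cooooool :-P!!! Call 555-123-4567"))
# The handle is stripped, everything except the emoticon :-P is lowercased,
# character runs longer than 3 are capped at 3 ("sooo", "coool"), and the
# phone number stays a single token because match_phone_numbers defaults
# to True (PHONE_REGEX sits second in REGEXPS_PHONE, ahead of the
# catch-all word patterns).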
backend/venv/Lib/site-packages/nltk/tokenize/destructive.py (new file, 234 lines)
@@ -0,0 +1,234 @@
# Natural Language Toolkit: NLTK's very own tokenizer.
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Liling Tan
#         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT


import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]


class NLTKWordTokenizer(TokenizerI):
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the method that is invoked by ``word_tokenize()``. It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize`, but there is no guarantee that this
    will revert to the original string.
    """

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"\s+"), " "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # For improvements for starting/closing quotes from TreebankWordTokenizer,
    # see discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
    # - chevron quotes u'\xab' and u'\xbb'
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
    # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
    # Also, behavior of splitting on clitics now follows Stanford CoreNLP
    # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b

    # Punctuation.
    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (
            re.compile(r"\.{2,}", re.U),
            r" \g<0> ",
        ),  # See https://github.com/nltk/nltk/pull/2322
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),  # Handles the final period.
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
        (
            re.compile(r"[*]", re.U),
            r" \g<0> ",
        ),  # See https://github.com/nltk/nltk/pull/2322
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally: Convert parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
            >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
            ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
            'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
            'of', 'them.', 'Thanks', '.']
            >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
            ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
            'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
            'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no "
                "longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self._contractions.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
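Reviewer note: a sketch (not part of the diff) of the span-recovery path above. Even though ``tokenize()`` is destructive, ``span_tokenize()`` re-aligns the token list against the raw text with ``align_tokens`` so that each span slices the original string.

from nltk.tokenize.destructive import NLTKWordTokenizer

s = "Good muffins cost $3.88."
spans = list(NLTKWordTokenizer().span_tokenize(s))
# Each (start, end) pair indexes into the *original* string:
print([(start, end, s[start:end]) for start, end in spans])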
backend/venv/Lib/site-packages/nltk/tokenize/legality_principle.py (new file, 147 lines)
@@ -0,0 +1,147 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
#         Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The Legality Principle is a language agnostic principle maintaining that syllable
onsets and codas (the beginning and ends of syllables not including the vowel)
are only legal if they are found as word onsets or codas in the language. The English
word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found
word-initially in the English language (Bartlett et al.). This principle was first proposed
in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''.

Kahn further argues that there is a ''strong tendency to syllabify in such a way that
initial clusters are of maximal length, consistent with the general constraints on
word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
the longest legal onset is preferable---''Onset Maximization''.

The default implementation assumes an English vowel set, but the `vowels` attribute
can be set to IPA or any other alphabet's vowel set for the use-case.
Both a valid set of vowels as well as a text corpus of words in the language
are necessary to determine legal onsets and subsequently syllabify words.

The legality principle with onset maximization is a universal syllabification algorithm,
but that does not mean it performs equally across languages. Bartlett et al. (2009)
is a good benchmark for English accuracy if utilizing IPA (pg. 311).

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11.
- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976).
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409-436.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley.
"""

from collections import Counter

from nltk.tokenize.api import TokenizerI


class LegalitySyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Legality Principle and Onset Maximization.

        >>> from nltk.tokenize import LegalitySyllableTokenizer
        >>> from nltk import word_tokenize
        >>> from nltk.corpus import words
        >>> text = "This is a wonderful sentence."
        >>> text_words = word_tokenize(text)
        >>> LP = LegalitySyllableTokenizer(words.words())
        >>> [LP.tokenize(word) for word in text_words]
        [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(
        self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
    ):
        """
        :param tokenized_source_text: List of valid tokens in the language
        :type tokenized_source_text: list(str)
        :param vowels: Valid vowels in language or IPA representation
        :type vowels: str
        :param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset
        :type legal_frequency_threshold: float
        """
        self.legal_frequency_threshold = legal_frequency_threshold
        self.vowels = vowels
        self.legal_onsets = self.find_legal_onsets(tokenized_source_text)

    def find_legal_onsets(self, words):
        """
        Gathers all onsets and then returns only those above the frequency threshold.

        :param words: List of words in a language
        :type words: list(str)
        :return: Set of legal onsets
        :rtype: set(str)
        """
        onsets = [self.onset(word) for word in words]
        legal_onsets = [
            k
            for k, v in Counter(onsets).items()
            if (v / len(onsets)) > self.legal_frequency_threshold
        ]
        return set(legal_onsets)

    def onset(self, word):
        """
        Returns the consonant cluster of a word, i.e. all characters until the first vowel.

        :param word: Single word or token
        :type word: str
        :return: String of characters of onset
        :rtype: str
        """
        onset = ""
        for c in word.lower():
            if c in self.vowels:
                return onset
            else:
                onset += c
        return onset

    def tokenize(self, token):
        """
        Apply the Legality Principle in combination with
        Onset Maximization to return a list of syllables.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        syllables = []
        syllable, current_onset = "", ""
        vowel, onset = False, False
        for char in token[::-1]:
            char_lower = char.lower()
            if not vowel:
                syllable += char
                vowel = bool(char_lower in self.vowels)
            else:
                if char_lower + current_onset[::-1] in self.legal_onsets:
                    syllable += char
                    current_onset += char_lower
                    onset = True
                elif char_lower in self.vowels and not onset:
                    syllable += char
                    current_onset += char_lower
                else:
                    syllables.append(syllable)
                    syllable = char
                    current_onset = ""
                    vowel = bool(char_lower in self.vowels)
        syllables.append(syllable)
        syllables_ordered = [syllable[::-1] for syllable in syllables][::-1]
        return syllables_ordered
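Reviewer note: a sketch (not part of the committed file) of the two inputs the class above needs: a word list to learn legal onsets from, and a vowel inventory (the `vowels` parameter accepts any alphabet, e.g. IPA). It assumes the NLTK `words` corpus is installed (nltk.download('words')).

from nltk.corpus import words
from nltk.tokenize import LegalitySyllableTokenizer

# Legal onsets are estimated from onset frequencies in the word list.
lp = LegalitySyllableTokenizer(words.words())
print(lp.tokenize("wonderful"))
# ['won', 'der', 'ful']  (per the class docstring above)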
backend/venv/Lib/site-packages/nltk/tokenize/mwe.py (new file, 124 lines)
@@ -0,0 +1,124 @@
# Multi-Word Expression tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Multi-Word Expression Tokenizer

A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:


    >>> from nltk.tokenize import MWETokenizer

    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
    >>> tokenizer.add_mwe(('in', 'spite', 'of'))

    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
    ['Testing', 'testing', 'testing', 'one', 'two', 'three']

    >>> tokenizer.tokenize('This is a test in spite'.split())
    ['This', 'is', 'a', 'test', 'in', 'spite']

    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']

"""
from nltk.tokenize.api import TokenizerI
from nltk.util import Trie


class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    """

    def __init__(self, mwes=None, separator="_"):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator.

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

            >>> tokenizer = MWETokenizer()
            >>> tokenizer.add_mwe(('a', 'b'))
            >>> tokenizer.add_mwe(('a', 'b', 'c'))
            >>> tokenizer.add_mwe(('a', 'x'))
            >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
            >>> tokenizer._mwes == expected
            True

        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

            >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
            >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
            ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:  # and len(trie[text[j]]) > 0 :
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        last_match = j
                else:
                    if last_match > -1:
                        j = last_match

                if Trie.LEAF in trie or last_match > -1:
                    # success!
                    result.append(self._separator.join(text[i:j]))
                    i = j
                else:
                    # no match, so backtrack
                    result.append(text[i])
                    i += 1
            else:
                result.append(text[i])
                i += 1
        return result
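Reviewer note: a sketch (not part of the diff) of the greedy-with-fallback trie walk in ``tokenize()`` above. The walk extends the match as far as it can, but remembers the last position where a complete MWE ended (``last_match``), so a longer partial match cannot swallow tokens.

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit')])
print(tokenizer.tokenize('a little bird'.split()))
# ['a_little', 'bird'] -- the walk tried to extend toward 'a little bit',
# 'bit' never arrived, so it fell back to the last complete expression.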
backend/venv/Lib/site-packages/nltk/tokenize/nist.py (new file, 179 lines)
@@ -0,0 +1,179 @@
# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
# Contributors: Ozan Caglayan, Wiktor Stribizew
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""


import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

        >>> from nltk.tokenize.nist import NISTTokenizer
        >>> nist = NISTTokenizer()
        >>> s = "Good muffins cost $3.88 in New York."
        >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
        >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
        >>> nist.tokenize(s, lowercase=False) == expected_cased
        True
        >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
        True

    The international_tokenize() is the preferred function when tokenizing
    non-European text, e.g.

        >>> from nltk.tokenize.nist import NISTTokenizer
        >>> nist = NISTTokenizer()

        # Input strings.
        >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
        >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
        >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

        # Expected tokens.
        >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
        >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
        >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

        >>> nist.international_tokenize(albb)[:10] == expected_albb
        True
        >>> nist.international_tokenize(amz)[:10] == expected_amz
        True
        >>> nist.international_tokenize(rkt)[:10] == expected_rkt
        True

        # Doctest for patching issue #1926
        >>> sent = u'this is a foo\u2604sentence.'
        >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
        >>> nist.international_tokenize(sent) == expected_sent
        True
    """

    # Strip "skipped" tags
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops characters used in NIST tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # Python regexes need to escape some special symbols,
    # see https://stackoverflow.com/q/45670950/610569
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
    #       (i) strip trailing and heading spaces and
    #       (ii) de-duplicate spaces.
    #       In Python, this would do: ' '.join(str.strip().split())
    # Thus, the next two lines were commented out.
    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator')))  # i.e. \p{Zl}
    # Separator = str(''.join(perluniprops.chars('Separator')))  # i.e. \p{Z}

    # Pads non-ascii strings with space.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = (
        re.compile(f"([{number_regex}])([{punct_regex}])"),
        "\\1 \\2 ",
    )
    PUNCT_2 = (
        re.compile(f"([{punct_regex}])([{number_regex}])"),
        " \\1 \\2",
    )
    # Tokenize symbols
    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It's a strange order of regexes.
        # It'll be better to unescape after STRIP_EOL_HYPHEN
        # but let's keep it close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regex.
        text = self.lang_independent_sub(text)
        # Language dependent regex.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = " ".join(text.split())
        # Finally, strips heading and trailing spaces
        # and converts output string into unicode.
        text = str(text.strip())
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        text = str(text)
        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # first before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space only between words.
        # Strip leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()
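Reviewer note: a sketch (not part of the committed file) of the western-language path above. Note that the class body reads the `perluniprops` corpus at import time, so nltk.download('perluniprops') is assumed.

from nltk.tokenize.nist import NISTTokenizer

nist = NISTTokenizer()
# '$' is split off by PUNCT; the period in '3.88' survives because both
# PERIOD_COMMA rules require a non-digit on one side; the final '.' is split.
print(nist.tokenize("Good muffins cost $3.88.", lowercase=True))
# ['good', 'muffins', 'cost', '$', '3.88', '.']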
backend/venv/Lib/site-packages/nltk/tokenize/punkt.py (new file, 1826 lines)
File diff suppressed because it is too large.
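Reviewer note: the punkt.py diff is collapsed above, but it ships ``PunktTokenizer``, which is what the cached ``sent_tokenize()`` in __init__.py loads. A minimal sketch, assuming the `punkt_tab` models are installed (nltk.download('punkt_tab')):

from nltk.tokenize.punkt import PunktTokenizer

pst = PunktTokenizer("english")
print(pst.tokenize("Good muffins cost $3.88 in New York. Please buy me two."))
# ['Good muffins cost $3.88 in New York.', 'Please buy me two.']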
backend/venv/Lib/site-packages/nltk/tokenize/regexp.py (new file, 220 lines)
@@ -0,0 +1,220 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         Trevor Cohn <tacohn@csse.unimelb.edu.au>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""
Regular-Expression Tokenizers

A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

A ``RegexpTokenizer`` can use its regexp to match delimiters instead:

    >>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']

Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.

The material between the tokens is discarded.  For example,
the following tokenizer selects just the capitalized words:

    >>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
    >>> capword_tokenizer.tokenize(s)
    ['Good', 'New', 'York', 'Please', 'Thanks']

This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.

    >>> from nltk.tokenize import BlanklineTokenizer
    >>> # Uses '\s*\n\s*\n\s*':
    >>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
    'Thanks.']

All of the regular expression tokenizers are also available as functions:

    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
    >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
    '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> blankline_tokenize(s)
    ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']

Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument.  This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
"""

import re

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize


class RegexpTokenizer(TokenizerI):
    r"""
    A tokenizer that splits a string using a regular expression, which
    matches either the tokens or the separators between tokens.

        >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

    :type pattern: str
    :param pattern: The pattern used to build this tokenizer.
        (This pattern must not contain capturing parentheses;
        use non-capturing parentheses, e.g. (?:...), instead)
    :type gaps: bool
    :param gaps: True if this tokenizer's pattern should be used
        to find separators between tokens; False if this
        tokenizer's pattern should be used to find the tokens
        themselves.
    :type discard_empty: bool
    :param discard_empty: True if any empty tokens `''`
        generated by the tokenizer should be discarded.  Empty
        tokens can only be generated if `_gaps == True`.
    :type flags: int
    :param flags: The regexp flags used to compile this
        tokenizer's pattern.  By default, the following flags are
        used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
    """

    def __init__(
        self,
        pattern,
        gaps=False,
        discard_empty=True,
        flags=re.UNICODE | re.MULTILINE | re.DOTALL,
    ):
        # If they gave us a regexp object, extract the pattern.
        pattern = getattr(pattern, "pattern", pattern)

        self._pattern = pattern
        self._gaps = gaps
        self._discard_empty = discard_empty
        self._flags = flags
        self._regexp = None

    def _check_regexp(self):
        if self._regexp is None:
            self._regexp = re.compile(self._pattern, self._flags)

    def tokenize(self, text):
        self._check_regexp()
        # If our regexp matches gaps, use re.split:
        if self._gaps:
            if self._discard_empty:
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)

        # If our regexp matches tokens, use re.findall:
        else:
            return self._regexp.findall(text)

    def span_tokenize(self, text):
        self._check_regexp()

        if self._gaps:
            for left, right in regexp_span_tokenize(text, self._regexp):
                if not (self._discard_empty and left == right):
                    yield left, right
        else:
            for m in re.finditer(self._regexp, text):
                yield m.span()

    def __repr__(self):
        return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format(
            self.__class__.__name__,
            self._pattern,
            self._gaps,
            self._discard_empty,
            self._flags,
        )


class WhitespaceTokenizer(RegexpTokenizer):
    r"""
    Tokenize a string on whitespace (space, tab, newline).
    In general, users should use the string ``split()`` method instead.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
        'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r"\s+", gaps=True)


class BlanklineTokenizer(RegexpTokenizer):
    """
    Tokenize a string, treating any sequence of blank lines as a delimiter.
    Blank lines are defined as lines containing no characters, except for
    space or tab characters.
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)


class WordPunctTokenizer(RegexpTokenizer):
    r"""
    Tokenize a text into a sequence of alphabetic and
    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

        >>> from nltk.tokenize import WordPunctTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    """

    def __init__(self):
        RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")


######################################################################
# { Tokenization Functions
######################################################################


def regexp_tokenize(
    text,
    pattern,
    gaps=False,
    discard_empty=True,
    flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
    """
    Return a tokenized copy of *text*.  See :class:`.RegexpTokenizer`
    for descriptions of the arguments.
    """
    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
    return tokenizer.tokenize(text)


blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
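
``span_tokenize()`` is useful when character offsets into the original string must be preserved. A minimal sketch (it only exercises the class defined above; nothing here is new API):

    from nltk.tokenize import RegexpTokenizer

    s = "Good muffins cost $3.88"
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    for start, end in tokenizer.span_tokenize(s):
        # Each span slices the original string back out as a token.
        print((start, end), s[start:end])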
149
backend/venv/Lib/site-packages/nltk/tokenize/repp.py
Normal file
@@ -0,0 +1,149 @@
# Natural Language Toolkit: Interface to the Repp Tokenizer
#
# Copyright (C) 2001-2015 NLTK Project
# Authors: Rebecca Dridan and Stephan Oepen
# Contributors: Liling Tan
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import os
import re
import subprocess
import sys
import tempfile

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI


class ReppTokenizer(TokenizerI):
    """
    A class for word tokenization using the REPP parser described in
    Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
    Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
    and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406

        >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.',
        ... 'But rule-based tokenizers are hard to maintain and their rules language specific.',
        ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
        ... ]
        >>> tokenizer = ReppTokenizer('/home/alvas/repp/')  # doctest: +SKIP
        >>> for sent in sents:  # doctest: +SKIP
        ...     tokenizer.tokenize(sent)  # doctest: +SKIP
        ...
        (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
        (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
        (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')

        >>> for sent in tokenizer.tokenize_sents(sents):  # doctest: +SKIP
        ...     print(sent)  # doctest: +SKIP
        ...
        (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
        (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
        (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
        >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True):  # doctest: +SKIP
        ...     print(sent)  # doctest: +SKIP
        ...
        [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
        [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
        [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
    """

    def __init__(self, repp_dir, encoding="utf8"):
        self.repp_dir = self.find_repptokenizer(repp_dir)
        # Set a directory to store the temporary files.
        self.working_dir = tempfile.gettempdir()
        # Set an encoding for the input strings.
        self.encoding = encoding

    def tokenize(self, sentence):
        """
        Use Repp to tokenize a single sentence.

        :param sentence: A single sentence string.
        :type sentence: str
        :return: A tuple of tokens.
        :rtype: tuple(str)
        """
        return next(self.tokenize_sents([sentence]))

    def tokenize_sents(self, sentences, keep_token_positions=False):
        """
        Tokenize multiple sentences using Repp.

        :param sentences: A list of sentence strings.
        :type sentences: list(str)
        :return: A list of tuples of tokens
        :rtype: iter(tuple(str))
        """
        with tempfile.NamedTemporaryFile(
            prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
        ) as input_file:
            # Write sentences to the temporary input file.
            for sent in sentences:
                input_file.write(str(sent) + "\n")
            input_file.close()
            # Generate the command to run REPP.
            cmd = self.generate_repp_command(input_file.name)
            # Decode the stdout and strip the ending newline.
            repp_output = self._execute(cmd).decode(self.encoding).strip()
            for tokenized_sent in self.parse_repp_outputs(repp_output):
                if not keep_token_positions:
                    # Remove token position information.
                    tokenized_sent, starts, ends = zip(*tokenized_sent)
                yield tokenized_sent

    def generate_repp_command(self, inputfilename):
        """
        Generate the REPP command to be run at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        """
        cmd = [self.repp_dir + "/src/repp"]
        cmd += ["-c", self.repp_dir + "/erg/repp.set"]
        cmd += ["--format", "triple"]
        cmd += [inputfilename]
        return cmd

    @staticmethod
    def _execute(cmd):
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        return stdout

    @staticmethod
    def parse_repp_outputs(repp_output):
        """
        Parse the tri-tuple format that REPP outputs when using the
        "--format triple" option, and return a generator of tuples of
        string tokens.

        :param repp_output:
        :type repp_output: str
        :return: an iterable of the tokenized sentences as tuples of strings
        :rtype: iter(tuple)
        """
        line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
        for section in repp_output.split("\n\n"):
            words_with_positions = [
                (token, int(start), int(end))
                for start, end, token in line_regex.findall(section)
            ]
            # The token is the first element of each (token, start, end) tuple.
            words = tuple(t[0] for t in words_with_positions)
            yield words_with_positions

    def find_repptokenizer(self, repp_dirname):
        """
        Find the REPP tokenizer binary and its *repp.set* config file.
        """
        if os.path.exists(repp_dirname):  # If a full path is given.
            _repp_dir = repp_dirname
        else:  # Try to find the path to the REPP directory in environment variables.
            _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
        # Check for the REPP binary and the erg/repp.set config file.
        assert os.path.exists(_repp_dir + "/src/repp")
        assert os.path.exists(_repp_dir + "/erg/repp.set")
        return _repp_dir
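
Because ``parse_repp_outputs`` is a static method, the triple format can be parsed without the REPP binary installed. A minimal sketch with a hand-written sample string in the documented ``(start, end, token)`` triple format (the sample is invented for illustration):

    from nltk.tokenize.repp import ReppTokenizer

    sample = "(0, 4, Good)\n(5, 12, muffins)"
    for sent in ReppTokenizer.parse_repp_outputs(sample):
        print(sent)  # [('Good', 0, 4), ('muffins', 5, 12)]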
140
backend/venv/Lib/site-packages/nltk/tokenize/sexpr.py
Normal file
@@ -0,0 +1,140 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
#         Steven Bird <stevenbird1@gmail.com> (minor edits)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
S-Expression Tokenizer

``SExprTokenizer`` is used to find parenthesized expressions in a
string.  In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.

    >>> from nltk.tokenize import SExprTokenizer
    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:

    >>> SExprTokenizer().tokenize('c) d) e (f (g')
    Traceback (most recent call last):
    ...
    ValueError: Un-matched close paren at char 1

The ``strict`` argument can be set to False to allow for
non-matching parentheses.  Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:

    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
    ['c', ')', 'd', ')', 'e', '(f (g']

The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:

    >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
    ['{a b {c d}}', 'e', 'f', '{g}']

The s-expression tokenizer is also available as a function:

    >>> from nltk.tokenize import sexpr_tokenize
    >>> sexpr_tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']
"""

import re

from nltk.tokenize.api import TokenizerI


class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into s-expressions.
    An s-expression can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.

    :param parens: A two-element sequence specifying the open and close parentheses
        that should be used to find sexprs.  This will typically be either a
        two-character string, or a list of two strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
    """

    def __init__(self, parens="()", strict=True):
        if len(parens) != 2:
            raise ValueError("parens must contain exactly two strings")
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        self._paren_regexp = re.compile(
            f"{re.escape(parens[0])}|{re.escape(parens[1])}"
        )

    def tokenize(self, text):
        """
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor.  If ``strict`` is ``True``, then
        raise a ``ValueError``.  If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str or iter(str)
        :rtype: iter(str)
        """
        result = []
        pos = 0
        depth = 0
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            if depth == 0:
                result += text[pos : m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError("Un-matched close paren at char %d" % m.start())
                depth = max(0, depth - 1)
                if depth == 0:
                    result.append(text[pos : m.end()])
                    pos = m.end()
        if self._strict and depth > 0:
            raise ValueError("Un-matched open paren at char %d" % pos)
        if pos < len(text):
            result.append(text[pos:])
        return result


sexpr_tokenize = SExprTokenizer().tokenize
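
Since nested parentheses are kept intact inside each s-expression, the tokenizer can be applied recursively to walk a nested structure. A small sketch (the `walk` helper is illustrative, not part of NLTK):

    from nltk.tokenize import sexpr_tokenize

    def walk(sexpr, depth=0):
        # Recurse into parenthesized tokens by stripping the outer parens.
        for tok in sexpr_tokenize(sexpr):
            print("  " * depth + tok)
            if tok.startswith("(") and tok.endswith(")"):
                walk(tok[1:-1], depth + 1)

    walk("(a b (c d)) e f (g)")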
139
backend/venv/Lib/site-packages/nltk/tokenize/simple.py
Normal file
@@ -0,0 +1,139 @@
# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    >>> s.split() # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York. Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
    """

    _string = None

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        yield from enumerate(range(1, len(s) + 1))
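        # Each span pairs an index with its successor: for "abc" this yields
        # (0, 1), (1, 2), (2, 3), i.e. one unit-width slice per character.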


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York. Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York. Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline.
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    # discard-eof not implemented
    def span_tokenize(self, s):
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


######################################################################
# { Tokenization Functions
######################################################################
# XXX: it is stated in the module docs that there are no function versions


def line_tokenize(text, blanklines="discard"):
    return LineTokenizer(blanklines).tokenize(text)
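
Because all of these classes implement the shared ``TokenizerI`` interface, they are interchangeable anywhere a tokenizer is expected. A minimal sketch (the `count_tokens` helper is illustrative, not part of NLTK):

    from nltk.tokenize.api import TokenizerI
    from nltk.tokenize.simple import LineTokenizer, SpaceTokenizer

    def count_tokens(tokenizer: TokenizerI, text: str) -> int:
        # Any TokenizerI implementation exposes .tokenize(text) -> list of strings.
        return len(tokenizer.tokenize(text))

    s = "Good muffins cost $3.88\nin New York."
    print(count_tokens(SpaceTokenizer(), s))  # splits on ' '
    print(count_tokens(LineTokenizer(), s))   # splits on newlines -> 2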
@@ -0,0 +1,194 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
#         Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
openness of the lips. Syllable breaks occur before troughs in sonority. For more
on the SSP see Selkirk (1984).

The default implementation uses the English alphabet, but the `sonority_hierarchy`
can be modified to IPA or any other alphabet for the use-case. The SSP is a
universal syllabification algorithm, but that does not mean it performs equally
across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
if utilizing IPA (pg. 311).

Importantly, if a custom hierarchy is supplied and vowels span across more than
one level, they should be given separately to the `vowels` class attribute.

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
"""

import re
import warnings
from string import punctuation

from nltk.tokenize.api import TokenizerI
from nltk.util import ngrams


class SyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Sonority Sequencing Principle (SSP).

        >>> from nltk.tokenize import SyllableTokenizer
        >>> from nltk import word_tokenize
        >>> SSP = SyllableTokenizer()
        >>> SSP.tokenize('justification')
        ['jus', 'ti', 'fi', 'ca', 'tion']
        >>> text = "This is a foobar-like sentence."
        >>> [SSP.tokenize(token) for token in word_tokenize(text)]
        [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(self, lang="en", sonority_hierarchy=False):
        """
        :param lang: Language parameter, default is English, 'en'
        :type lang: str
        :param sonority_hierarchy: Sonority hierarchy according to the
            Sonority Sequencing Principle.
        :type sonority_hierarchy: list(str)
        """
        # The sonority hierarchy should be provided in descending order.
        # If vowels are spread across multiple levels, they should be
        # assigned to the self.vowels variable together; otherwise they
        # should be placed in the first index of the hierarchy.
        if not sonority_hierarchy and lang == "en":
            sonority_hierarchy = [
                "aeiouy",  # vowels.
                "lmnrw",  # nasals.
                "zvsf",  # fricatives.
                "bcdgtkpqxhj",  # stops.
            ]

        self.vowels = sonority_hierarchy[0]
        self.phoneme_map = {}
        for i, level in enumerate(sonority_hierarchy):
            for c in level:
                sonority_level = len(sonority_hierarchy) - i
                self.phoneme_map[c] = sonority_level
                self.phoneme_map[c.upper()] = sonority_level

    def assign_values(self, token):
        """
        Assigns each phoneme its value from the sonority hierarchy.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return: List of tuples, first element is character/phoneme and
            second is the sonority value.
        :rtype: list(tuple(str, int))
        """
        syllables_values = []
        for c in token:
            try:
                syllables_values.append((c, self.phoneme_map[c]))
            except KeyError:
                if c not in "0123456789" and c not in punctuation:
                    warnings.warn(
                        "Character not defined in sonority_hierarchy,"
                        " assigning as vowel: '{}'".format(c)
                    )
                    syllables_values.append((c, max(self.phoneme_map.values())))
                    if c not in self.vowels:
                        self.vowels += c
                else:  # If it's a punctuation mark or a number, assign -1.
                    syllables_values.append((c, -1))
        return syllables_values

    def validate_syllables(self, syllable_list):
        """
        Ensures each syllable has at least one vowel.
        If the following syllable doesn't have a vowel, add it to the current one.

        :param syllable_list: Single word or token broken up into syllables.
        :type syllable_list: list(str)
        :return: Single word or token broken up into syllables
            (with added syllables if necessary)
        :rtype: list(str)
        """
        valid_syllables = []
        front = ""
        vowel_pattern = re.compile("|".join(self.vowels))
        for i, syllable in enumerate(syllable_list):
            if syllable in punctuation:
                valid_syllables.append(syllable)
                continue
            if not vowel_pattern.search(syllable):
                if len(valid_syllables) == 0:
                    front += syllable
                else:
                    valid_syllables = valid_syllables[:-1] + [
                        valid_syllables[-1] + syllable
                    ]
            else:
                if len(valid_syllables) == 0:
                    valid_syllables.append(front + syllable)
                else:
                    valid_syllables.append(syllable)

        return valid_syllables

    def tokenize(self, token):
        """
        Apply the SSP to return a list of syllables.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        # assign values from hierarchy
        syllables_values = self.assign_values(token)

        # if only one vowel return word
        if sum(token.count(x) for x in self.vowels) <= 1:
            return [token]

        syllable_list = []
        syllable = syllables_values[0][0]  # start syllable with first phoneme
        for trigram in ngrams(syllables_values, n=3):
            phonemes, values = zip(*trigram)
            # Sonority of previous, focal and following phoneme
            prev_value, focal_value, next_value = values
            # Focal phoneme.
            focal_phoneme = phonemes[1]

            # These cases trigger a syllable break.
            if focal_value == -1:  # If it's punctuation, just break.
                syllable_list.append(syllable)
                syllable_list.append(focal_phoneme)
                syllable = ""
            elif prev_value >= focal_value == next_value:
                syllable += focal_phoneme
                syllable_list.append(syllable)
                syllable = ""

            elif prev_value > focal_value < next_value:
                syllable_list.append(syllable)
                syllable = ""
                syllable += focal_phoneme

            # no syllable break
            else:
                syllable += focal_phoneme

        syllable += syllables_values[-1][0]  # append last phoneme
        syllable_list.append(syllable)

        return self.validate_syllables(syllable_list)
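
As the module docstring notes, the hierarchy can be swapped out for another alphabet. A minimal sketch (the four-level hierarchy below is an invented illustration, not a shipped configuration):

    from nltk.tokenize import SyllableTokenizer

    # Custom descending hierarchy: vowels first, then increasingly closed sounds.
    custom = ["aeiou", "jlmnrwy", "szvf", "bdgptkc"]
    ssp = SyllableTokenizer(sonority_hierarchy=custom)
    print(ssp.tokenize("segment"))

If vowels appeared on more than one level, they would additionally need to be assigned to the tokenizer's `vowels` attribute, as the docstring warns.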
115
backend/venv/Lib/site-packages/nltk/tokenize/stanford.py
Normal file
@@ -0,0 +1,115 @@
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import json
import os
import tempfile
import warnings
from subprocess import PIPE

from nltk.internals import _java_options, config_java, find_jar, java
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"


class StanfordTokenizer(TokenizerI):
    r"""
    Interface to the Stanford Tokenizer

        >>> from nltk.tokenize.stanford import StanfordTokenizer
        >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
        >>> StanfordTokenizer().tokenize(s)  # doctest: +SKIP
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
        >>> s = "The colour of the wall is blue."
        >>> StanfordTokenizer(options={"americanize": True}).tokenize(s)  # doctest: +SKIP
        ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    """

    _JAR = "stanford-postagger.jar"

    def __init__(
        self,
        path_to_jar=None,
        encoding="utf8",
        options=None,
        verbose=False,
        java_options="-mx1000m",
    ):
        # Raise deprecation warning.
        warnings.warn(
            str(
                "\nThe StanfordTokenizer will "
                "be deprecated in version 3.2.5.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead."
            ),
            DeprecationWarning,
            stacklevel=2,
        )

        self._stanford_jar = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=("STANFORD_POSTAGGER",),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )

        self._encoding = encoding
        self.java_options = java_options

        options = {} if options is None else options
        self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items())

    @staticmethod
    def _parse_tokenized_output(s):
        return s.splitlines()

    def tokenize(self, s):
        """
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        """
        cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
        return self._parse_tokenized_output(self._execute(cmd, s))

    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(["-charset", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, str) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            cmd.append(input_file.name)

            # Run the tagger and get the output.
            stdout, stderr = java(
                cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
            )
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
292
backend/venv/Lib/site-packages/nltk/tokenize/stanford_segmenter.py
Normal file
@@ -0,0 +1,292 @@
#!/usr/bin/env python
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2025 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
#         Alex Constantin <alex@keyworder.ch>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import json
import os
import tempfile
import warnings
from subprocess import PIPE

from nltk.internals import (
    _java_options,
    config_java,
    find_dir,
    find_file,
    find_jar,
    java,
)
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software"


class StanfordSegmenter(TokenizerI):
    """Interface to the Stanford Segmenter

    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provided, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

        >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
        >>> seg = StanfordSegmenter()  # doctest: +SKIP
        >>> seg.default_config('zh')  # doctest: +SKIP
        >>> sent = u'这是斯坦福中文分词器测试'
        >>> print(seg.segment(sent))  # doctest: +SKIP
        \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
        <BLANKLINE>
        >>> seg.default_config('ar')  # doctest: +SKIP
        >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
        >>> print(seg.segment(sent.split()))  # doctest: +SKIP
        \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
        <BLANKLINE>
    """

    _JAR = "stanford-segmenter.jar"

    def __init__(
        self,
        path_to_jar=None,
        path_to_slf4j=None,
        java_class=None,
        path_to_model=None,
        path_to_dict=None,
        path_to_sihan_corpora_dict=None,
        sihan_post_processing="false",
        keep_whitespaces="false",
        encoding="UTF-8",
        options=None,
        verbose=False,
        java_options="-mx2g",
    ):
        # Raise deprecation warning.
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            str(
                "\nThe StanfordSegmenter will "
                "be deprecated in version 3.2.5.\n"
                "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
            ),
            DeprecationWarning,
            stacklevel=2,
        )
        warnings.simplefilter("ignore", DeprecationWarning)

        stanford_segmenter = find_jar(
            self._JAR,
            path_to_jar,
            env_vars=("STANFORD_SEGMENTER",),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
        )
        if path_to_slf4j is not None:
            slf4j = find_jar(
                "slf4j-api.jar",
                path_to_slf4j,
                env_vars=("SLF4J", "STANFORD_SEGMENTER"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
            )
        else:
            slf4j = None

        # This is passed to java as the -cp option; the old version of the
        # segmenter needs slf4j. The new version, stanford-segmenter-2016-10-31,
        # doesn't need slf4j.
        self._stanford_jar = os.pathsep.join(
            _ for _ in [stanford_segmenter, slf4j] if _ is not None
        )

        self._java_class = java_class
        self._model = path_to_model
        self._sihan_corpora_dict = path_to_sihan_corpora_dict
        self._sihan_post_processing = sihan_post_processing
        self._keep_whitespaces = keep_whitespaces
        self._dict = path_to_dict

        self._encoding = encoding
        self.java_options = java_options
        options = {} if options is None else options
        self._options_cmd = ",".join(
            f"{key}={json.dumps(val)}" for key, val in options.items()
        )

    def default_config(self, lang):
        """
        Attempt to initialize the Stanford Word Segmenter for the specified language,
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables.
        """

        search_path = ()
        if os.environ.get("STANFORD_SEGMENTER"):
            search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}

        # init for Chinese-specific files
        self._dict = None
        self._sihan_corpora_dict = None
        self._sihan_post_processing = "false"

        if lang == "ar":
            self._java_class = (
                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
            )
            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"

        elif lang == "zh":
            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
            model = "pku.gz"
            self._sihan_post_processing = "true"

            path_to_dict = "dict-chris6.ser.gz"
            try:
                self._dict = find_file(
                    path_to_dict,
                    searchpath=search_path,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_MODELS",),
                )
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using env. "
                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
                    % path_to_dict
                ) from e

            sihan_dir = "./data/"
            try:
                path_to_sihan_dir = find_dir(
                    sihan_dir,
                    url=_stanford_url,
                    verbose=False,
                    env_vars=("STANFORD_SEGMENTER",),
                )
                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
            except LookupError as e:
                raise LookupError(
                    "Could not find '%s' (tried using the "
                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
                ) from e
        else:
            raise LookupError(f"Unsupported language {lang}")

        try:
            self._model = find_file(
                model,
                searchpath=search_path,
                url=_stanford_url,
                verbose=False,
                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
            )
        except LookupError as e:
            raise LookupError(
                "Could not find '%s' (tried using env. "
                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
            ) from e

    def tokenize(self, s):
        super().tokenize(s)

    def segment_file(self, input_file_path):
        """ """
        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        return stdout

    def segment(self, tokens):
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """ """
        encoding = self._encoding
        # Create a temporary input file
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        # Write the actual sentences to the temporary input file
        _input_fh = os.fdopen(_input_fh, "wb")
        _input = "\n".join(" ".join(x) for x in sentences)
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)
        _input_fh.write(_input)
        _input_fh.close()

        cmd = [
            self._java_class,
            "-loadClassifier",
            self._model,
            "-keepAllWhitespaces",
            self._keep_whitespaces,
            "-textFile",
            self._input_file_path,
        ]
        if self._sihan_corpora_dict is not None:
            cmd.extend(
                [
                    "-serDictionary",
                    self._dict,
                    "-sighanCorporaDict",
                    self._sihan_corpora_dict,
                    "-sighanPostProcessing",
                    self._sihan_post_processing,
                ]
            )

        stdout = self._execute(cmd)

        # Delete the temporary file
        os.unlink(self._input_file_path)

        return stdout

    def _execute(self, cmd, verbose=False):
        encoding = self._encoding
        cmd.extend(["-inputEncoding", encoding])
        _options_cmd = self._options_cmd
        if _options_cmd:
            cmd.extend(["-options", self._options_cmd])

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        stdout, _stderr = java(
            cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
        )
        stdout = stdout.decode(encoding)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
474
backend/venv/Lib/site-packages/nltk/tokenize/texttiling.py
Normal file
@@ -0,0 +1,474 @@
# Natural Language Toolkit: TextTiling
#
# Copyright (C) 2001-2025 NLTK Project
# Author: George Boutsioukis
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import math
import re

try:
    import numpy
except ImportError:
    pass

from nltk.tokenize.api import TokenizerI

BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1
LC, HC = 0, 1
DEFAULT_SMOOTHING = [0]


class TextTilingTokenizer(TokenizerI):
    """Tokenize a document into topical sections using the TextTiling algorithm.
    This algorithm detects subtopic shifts based on the analysis of lexical
    co-occurrence patterns.

    The process starts by tokenizing the text into pseudosentences of
    a fixed size w. Then, depending on the method used, similarity
    scores are assigned at sentence gaps. The algorithm proceeds by
    detecting the peak differences between these scores and marking
    them as boundaries. The boundaries are normalized to the closest
    paragraph break and the segmented text is returned.

    :param w: Pseudosentence size
    :type w: int
    :param k: Size (in sentences) of the block used in the block comparison method
    :type k: int
    :param similarity_method: The method used for determining similarity scores:
        `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
    :type similarity_method: constant
    :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
    :type stopwords: list(str)
    :param smoothing_method: The method used for smoothing the score plot:
        `DEFAULT_SMOOTHING` (default)
    :type smoothing_method: constant
    :param smoothing_width: The width of the window used by the smoothing method
    :type smoothing_width: int
    :param smoothing_rounds: The number of smoothing passes
    :type smoothing_rounds: int
    :param cutoff_policy: The policy used to determine the number of boundaries:
        `HC` (default) or `LC`
    :type cutoff_policy: constant

        >>> from nltk.corpus import brown
        >>> tt = TextTilingTokenizer(demo_mode=True)
        >>> text = brown.raw()[:4000]
        >>> s, ss, d, b = tt.tokenize(text)
        >>> b
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
    """

    def __init__(
        self,
        w=20,
        k=10,
        similarity_method=BLOCK_COMPARISON,
        stopwords=None,
        smoothing_method=DEFAULT_SMOOTHING,
        smoothing_width=2,
        smoothing_rounds=1,
        cutoff_policy=HC,
        demo_mode=False,
    ):
        if stopwords is None:
            from nltk.corpus import stopwords

            stopwords = stopwords.words("english")
        self.__dict__.update(locals())
        del self.__dict__["self"]

    def tokenize(self, text):
        """Return a tokenized copy of *text*, where each "token" represents
        a separate topic."""

        lowercase_text = text.lower()
        paragraph_breaks = self._mark_paragraph_breaks(text)
        text_length = len(lowercase_text)

        # Tokenization step starts here

        # Remove punctuation
        nopunct_text = "".join(
            c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c)
        )
        nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)

        tokseqs = self._divide_to_tokensequences(nopunct_text)

        # The morphological stemming step mentioned in the TextTile
        # paper is not implemented. A comment in the original C
        # implementation states that it offers no benefit to the
        # process. It might be interesting to test the existing
        # stemmers though.
        # words = _stem_words(words)

        # Filter stopwords
        for ts in tokseqs:
            ts.wrdindex_list = [
                wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords
            ]

        token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
        # End of the Tokenization step

        # Lexical score determination
        if self.similarity_method == BLOCK_COMPARISON:
            gap_scores = self._block_comparison(tokseqs, token_table)
        elif self.similarity_method == VOCABULARY_INTRODUCTION:
            raise NotImplementedError("Vocabulary introduction not implemented")
        else:
            raise ValueError(
                f"Similarity method {self.similarity_method} not recognized"
            )

        if self.smoothing_method == DEFAULT_SMOOTHING:
            smooth_scores = self._smooth_scores(gap_scores)
        else:
            raise ValueError(f"Smoothing method {self.smoothing_method} not recognized")
        # End of Lexical score Determination

        # Boundary identification
        depth_scores = self._depth_scores(smooth_scores)
        segment_boundaries = self._identify_boundaries(depth_scores)

        normalized_boundaries = self._normalize_boundaries(
            text, segment_boundaries, paragraph_breaks
        )
        # End of Boundary Identification
        segmented_text = []
        prevb = 0

        for b in normalized_boundaries:
            if b == 0:
                continue
            segmented_text.append(text[prevb:b])
            prevb = b

        if prevb < text_length:  # append any text that may be remaining
            segmented_text.append(text[prevb:])

        if not segmented_text:
            segmented_text = [text]

        if self.demo_mode:
            return gap_scores, smooth_scores, depth_scores, segment_boundaries
        return segmented_text
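
    # Example (editor's sketch, not part of the original module): with the
    # default settings the tokenizer returns the list of topical segments
    # directly (requires numpy plus the stopwords and brown corpora)::
    #
    #     tt = TextTilingTokenizer()
    #     segments = tt.tokenize(brown.raw()[:4000])
    #     len(segments)  # number of topical sections found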

    def _block_comparison(self, tokseqs, token_table):
        """Implements the block comparison method"""

        def blk_frq(tok, block):
            ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences)
            freq = sum(tsocc[1] for tsocc in ts_occs)
            return freq

        gap_scores = []
        numgaps = len(tokseqs) - 1

        for curr_gap in range(numgaps):
            score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
            score = 0.0
            # adjust window size for boundary conditions
            if curr_gap < self.k - 1:
                window_size = curr_gap + 1
            elif curr_gap > numgaps - self.k:
                window_size = numgaps - curr_gap
            else:
                window_size = self.k

            b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]]
            b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]]

            for t in token_table:
                score_dividend += blk_frq(t, b1) * blk_frq(t, b2)
                score_divisor_b1 += blk_frq(t, b1) ** 2
                score_divisor_b2 += blk_frq(t, b2) ** 2
            try:
                score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2)
            except ZeroDivisionError:
                pass  # score += 0.0

            gap_scores.append(score)

        return gap_scores
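
    # The gap score above is the cosine similarity between the two blocks'
    # term-frequency vectors:
    #
    #     score(g) = sum_t f(t, b1) * f(t, b2)
    #                / sqrt(sum_t f(t, b1)**2 * sum_t f(t, b2)**2)
    #
    # 1.0 means the blocks share the same vocabulary distribution; 0.0 means
    # they share no terms at all.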

    def _smooth_scores(self, gap_scores):
        "Wraps the smooth function from the SciPy Cookbook"
        return list(
            smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1)
        )

    def _mark_paragraph_breaks(self, text):
        """Identifies indented text or line breaks as the beginning of
        paragraphs"""
        MIN_PARAGRAPH = 100
        pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
        matches = pattern.finditer(text)

        last_break = 0
        pbreaks = [0]
        for pb in matches:
            if pb.start() - last_break < MIN_PARAGRAPH:
                continue
            else:
                pbreaks.append(pb.start())
                last_break = pb.start()

        return pbreaks

    def _divide_to_tokensequences(self, text):
        "Divides the text into pseudosentences of fixed size"
        w = self.w
        wrdindex_list = []
        matches = re.finditer(r"\w+", text)
        for match in matches:
            wrdindex_list.append((match.group(), match.start()))
        return [
            TokenSequence(i / w, wrdindex_list[i : i + w])
            for i in range(0, len(wrdindex_list), w)
        ]

    def _create_token_table(self, token_sequences, par_breaks):
        "Creates a table of TokenTableFields"
        token_table = {}
        current_par = 0
        current_tok_seq = 0
        pb_iter = par_breaks.__iter__()
        current_par_break = next(pb_iter)
        if current_par_break == 0:
            try:
                current_par_break = next(pb_iter)  # skip break at 0
            except StopIteration as e:
                raise ValueError(
                    "No paragraph breaks were found (text too short perhaps?)"
                ) from e
        for ts in token_sequences:
            for word, index in ts.wrdindex_list:
                try:
                    while index > current_par_break:
                        current_par_break = next(pb_iter)
                        current_par += 1
                except StopIteration:
                    # hit bottom
                    pass

                if word in token_table:
                    token_table[word].total_count += 1

                    if token_table[word].last_par != current_par:
                        token_table[word].last_par = current_par
                        token_table[word].par_count += 1

                    if token_table[word].last_tok_seq != current_tok_seq:
                        token_table[word].last_tok_seq = current_tok_seq
                        token_table[word].ts_occurences.append([current_tok_seq, 1])
                    else:
                        token_table[word].ts_occurences[-1][1] += 1
                else:  # new word
                    token_table[word] = TokenTableField(
                        first_pos=index,
                        ts_occurences=[[current_tok_seq, 1]],
                        total_count=1,
                        par_count=1,
                        last_par=current_par,
                        last_tok_seq=current_tok_seq,
                    )

            current_tok_seq += 1

        return token_table

    def _identify_boundaries(self, depth_scores):
        """Identifies boundaries at the peaks of similarity score
        differences"""

        boundaries = [0 for x in depth_scores]

        avg = sum(depth_scores) / len(depth_scores)
        stdev = numpy.std(depth_scores)

        if self.cutoff_policy == LC:
            cutoff = avg - stdev
        else:
            cutoff = avg - stdev / 2.0

        depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
        depth_tuples.reverse()
        hp = list(filter(lambda x: x[0] > cutoff, depth_tuples))

        for dt in hp:
            boundaries[dt[1]] = 1
            for dt2 in hp:  # undo if there is a boundary close already
                if (
                    dt[1] != dt2[1]
                    and abs(dt2[1] - dt[1]) < 4
                    and boundaries[dt2[1]] == 1
                ):
                    boundaries[dt[1]] = 0
        return boundaries

    def _depth_scores(self, scores):
        """Calculates the depth of each gap, i.e. the average difference
        between the left and right peaks and the gap's score"""

        depth_scores = [0 for x in scores]
        # clip boundaries: this holds on the rule of thumb (my thumb)
        # that a section shouldn't be smaller than at least 2
        # pseudosentences for small texts and around 5 for larger ones.

        clip = min(max(len(scores) // 10, 2), 5)
        index = clip

        for gapscore in scores[clip:-clip]:
            lpeak = gapscore
            for score in scores[index::-1]:
                if score >= lpeak:
                    lpeak = score
                else:
                    break
            rpeak = gapscore
            for score in scores[index:]:
                if score >= rpeak:
                    rpeak = score
                else:
                    break
            depth_scores[index] = lpeak + rpeak - 2 * gapscore
            index += 1

        return depth_scores
|
||||
|
||||
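    # Editorial worked example (illustrative, ignoring the boundary clip):
    # for scores = [5, 3, 6], the gap at index 1 has lpeak = 5 and rpeak = 6,
    # so its depth is 5 + 6 - 2 * 3 = 5; deep valleys between two
    # high-similarity plateaus receive the largest depth scores.
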
    def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
        """Normalize the boundaries identified to the original text's
        paragraph breaks"""

        norm_boundaries = []
        char_count, word_count, gaps_seen = 0, 0, 0
        seen_word = False

        for char in text:
            char_count += 1
            if char in " \t\n" and seen_word:
                seen_word = False
                word_count += 1
            if char not in " \t\n" and not seen_word:
                seen_word = True
            if gaps_seen < len(boundaries) and word_count > (
                max(gaps_seen * self.w, self.w)
            ):
                if boundaries[gaps_seen] == 1:
                    # find closest paragraph break
                    best_fit = len(text)
                    for br in paragraph_breaks:
                        if best_fit > abs(br - char_count):
                            best_fit = abs(br - char_count)
                            bestbr = br
                        else:
                            break
                    if bestbr not in norm_boundaries:  # avoid duplicates
                        norm_boundaries.append(bestbr)
                gaps_seen += 1

        return norm_boundaries


class TokenTableField:
    """A field in the token table holding parameters for each token,
    used later in the process"""

    def __init__(
        self,
        first_pos,
        ts_occurences,
        total_count=1,
        par_count=1,
        last_par=0,
        last_tok_seq=None,
    ):
        self.__dict__.update(locals())
        del self.__dict__["self"]


class TokenSequence:
    "A token list with its original length and its index"

    def __init__(self, index, wrdindex_list, original_length=None):
        original_length = original_length or len(wrdindex_list)
        self.__dict__.update(locals())
        del self.__dict__["self"]


# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth
def smooth(x, window_len=11, window="flat"):
    """smooth the data using a window with requested size.

    This method is based on the convolution of a scaled window with the signal.
    The signal is prepared by introducing reflected copies of the signal
    (with the window size) in both ends so that transient parts are minimized
    in the beginning and end part of the output signal.

    :param x: the input signal
    :param window_len: the dimension of the smoothing window; should be an odd integer
    :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
        flat window will produce a moving average smoothing.

    :return: the smoothed signal

    example::

        t = numpy.linspace(-2, 2, 50)
        x = numpy.sin(t) + numpy.random.randn(len(t)) * 0.1
        y = smooth(x)

    :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
        scipy.signal.lfilter

    TODO: the window parameter could be the window itself if an array instead of a string
    """

    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")

    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")

    if window_len < 3:
        return x

    if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
        raise ValueError(
            "Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
        )

    s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]

    if window == "flat":  # moving average
        w = numpy.ones(window_len, "d")
    else:
        # getattr avoids eval(); equivalent for the validated window names above.
        w = getattr(numpy, window)(window_len)

    y = numpy.convolve(w / w.sum(), s, mode="same")

    return y[window_len - 1 : -window_len + 1]


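# Editor's sketch (illustrative, not part of NLTK): a minimal self-contained
# check of `smooth`. The reflected padding keeps the output the same length
# as the input, and a flat window is a plain moving average.
def _smooth_example():
    rng = numpy.random.RandomState(0)  # fixed seed for reproducibility
    t = numpy.linspace(-2, 2, 200)
    x = numpy.sin(t) + rng.randn(len(t)) * 0.1
    y = smooth(x, window_len=11, window="flat")
    assert len(y) == len(x)
    return y

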
def demo(text=None):
    from matplotlib import pylab

    from nltk.corpus import brown

    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()

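
# Editorial addition (not in the original module): allow running the demo
# directly as a script.
if __name__ == "__main__":
    demo()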

180
backend/venv/Lib/site-packages/nltk/tokenize/toktok.py
Normal file
@@ -0,0 +1,180 @@
# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jon Dehdari
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters,
#               Alex Rudnick
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""

import re

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
    >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
    True
    """

    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile("\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 "
    # Pad En dash and em dash
    EN_EM_DASHES = re.compile("([–—])"), r" \1 "

    # Replace problematic character with numeric character reference.
    AMPERCENT = re.compile("& "), "&amp; "
    TAB = re.compile("\t"), " &#9; "
    PIPE = re.compile(r"\|"), " &#124; "

    # Pad numbers with commas to keep them from further tokenization.
    COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "

    # Just pad problematic (often neurotic) hyphen/single quote, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
    # Group ` ` stupid quotes ' ' into a single token.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "

    # Don't tokenize period unless it ends the line and that it isn't
    # preceded by another period, e.g.
    # "something ..." -> "something ..."
    # "something." -> "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Don't tokenize period unless it ends the line eg.
    # " ... stuff." -> "... stuff ."
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"

    # Treat continuous commas as fake German, Czech, etc.: „
    MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
    # Treat continuous dashes as fake en-dash, etc.
    MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
    # Treat multiple periods as a thing (eg. ellipsis)
    MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "

    # This is the \p{Open_Punctuation} from Perl's perluniprops
    # see https://perldoc.perl.org/perluniprops.html
    OPEN_PUNCT = str(
        "([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
        "\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
        "\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
        "\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
        "\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
        "\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
        "\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
        "\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
        "\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
    )
    # This is the \p{Close_Punctuation} from Perl's perluniprops
    CLOSE_PUNCT = str(
        ")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
        "\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
        "\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
        "\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
        "\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
        "\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
        "\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
        "\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
        "\uff09\uff3d\uff5d\uff60\uff63"
    )
    # This is the \p{Currency_Symbol} from Perl's perluniprops
    CURRENCY_SYM = str(
        "$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
        "\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
        "\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
        "\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
        "\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
        "\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
    )

    # Pad spaces after opening punctuations.
    OPEN_PUNCT_RE = re.compile(f"([{OPEN_PUNCT}])"), r"\1 "
    # Pad spaces before closing punctuations.
    CLOSE_PUNCT_RE = re.compile(f"([{CLOSE_PUNCT}])"), r"\1 "
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(f"([{CURRENCY_SYM}])"), r"\1 "

    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r":(?!//)"), r" : "  # in perl s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? "  # in perl s{\?(?!\S)}{ ? }g;
    # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
    URL_FOE_4 = re.compile(r" /"), r" / "  # s{ /}{ / }g;

    # Left/Right strip, i.e. remove heading/trailing spaces.
    # These strip regexes should NOT be used,
    # instead use str.lstrip(), str.rstrip() or str.strip()
    # (They are kept for reference purposes to the original toktok.pl code)
    LSTRIP = re.compile(r"^ +"), ""
    RSTRIP = re.compile(r"\s+$"), "\n"
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r" {2,}"), " "

    TOKTOK_REGEXES = [
        NON_BREAKING,
        FUNKY_PUNCT_1,
        FUNKY_PUNCT_2,
        URL_FOE_1,
        URL_FOE_2,
        URL_FOE_3,
        URL_FOE_4,
        AMPERCENT,
        TAB,
        PIPE,
        OPEN_PUNCT_RE,
        CLOSE_PUNCT_RE,
        MULTI_COMMAS,
        COMMA_IN_NUM,
        PROB_SINGLE_QUOTES,
        STUPID_QUOTES_1,
        STUPID_QUOTES_2,
        CURRENCY_SYM_RE,
        EN_EM_DASHES,
        MULTI_DASHES,
        MULTI_DOTS,
        FINAL_PERIOD_1,
        FINAL_PERIOD_2,
        ONE_SPACE,
    ]
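    # Note (editorial): the order of TOKTOK_REGEXES matters; for example,
    # ONE_SPACE must run last so that the padding introduced by the earlier
    # rules collapses back to single spaces.
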
    def tokenize(self, text, return_str=False):
        text = str(text)  # Converts input string into unicode.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strips heading and trailing spaces
        # and converts output string into unicode.
        text = str(text.strip())
        return text if return_str else text.split()
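
# Editor's sketch (illustrative, not part of NLTK): typical usage. Tok-tok
# assumes one sentence per line, so only the sentence-final period is split:
#
#     >>> ToktokTokenizer().tokenize("Hello, world. And more.")
#     ['Hello', ',', 'world.', 'And', 'more', '.']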

402
backend/venv/Lib/site-packages/nltk/tokenize/treebank.py
Normal file
@@ -0,0 +1,402 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#         Tom Aarsen <> (modifications)
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
from nltk.tokenize.util import align_tokens


class TreebankWordTokenizer(TokenizerI):
    r"""
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    """

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (re.compile(r"\.\.\."), r" ... "),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),  # Handles the final period.
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str is not False:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no "
                "longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self._contractions.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)


class TreebankWordDetokenizer(TokenizerI):
    r"""
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - There are additional assumptions made when undoing the padding of ``[;@#$%&]``
      punctuation symbols that aren't presupposed in the TreebankTokenizer.
    - There are additional regexes added in reversing the parentheses tokenization,
      such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
      padding added to the closing parentheses preceding ``[:;,.]``.
    - It's not possible to return the original whitespaces as they were because
      there was no explicit record of where `'\n'`, `'\t'` or `'\s'` were removed at
      the text.split() operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add more spaces but during detokenization,
    simply undoing the padding doesn't really help.

    - During tokenization, left and right pad is added to ``[!?]``, when
      detokenizing, only left shift the ``[!?]`` is needed.
      Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

    - During tokenization ``[:,]`` are left and right padded but when detokenizing,
      only left shift is necessary and we keep right pad after comma/colon
      if the string after is a non-digit.
      Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    """

    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS2
    ]
    CONTRACTIONS3 = [
        re.compile(pattern.replace("(?#X)", r"\s"))
        for pattern in _contractions.CONTRACTIONS3
    ]

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
        (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
        (re.compile(r"(\S)\s(\'\')"), r"\1\2"),
        (
            re.compile(r"(\'\')\s([.,:)\]>};%])"),
            r"\1\2",
        ),  # Quotes followed by no-left-padded punctuations.
        (re.compile(r"''"), '"'),
    ]

    # Handles double dashes
    DOUBLE_DASHES = (re.compile(r" -- "), r"--")

    # Optionally: Convert parentheses, brackets and converts them from PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile("-LRB-"), "("),
        (re.compile("-RRB-"), ")"),
        (re.compile("-LSB-"), "["),
        (re.compile("-RSB-"), "]"),
        (re.compile("-LCB-"), "{"),
        (re.compile("-RCB-"), "}"),
    ]

    # Undo padding on parentheses.
    PARENS_BRACKETS = [
        (re.compile(r"([\[\(\{\<])\s"), r"\g<1>"),
        (re.compile(r"\s([\]\)\}\>])"), r"\g<1>"),
        (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([^'])\s'\s"), r"\1' "),
        (re.compile(r"\s([?!])"), r"\g<1>"),  # Strip left pad for [?!]
        # (re.compile(r'\s([?!])\s'), r'\g<1>'),
        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
        # When tokenizing, [;@#$%&] are padded with whitespace regardless of
        # whether there are spaces before or after them.
        # But during detokenization, we need to distinguish between left/right
        # pad, so we split this up.
        (re.compile(r"([#$])\s"), r"\g<1>"),  # Left pad.
        (re.compile(r"\s([;%])"), r"\g<1>"),  # Right pad.
        # (re.compile(r"\s([&*])\s"), r" \g<1> "),  # Unknown pad.
        (re.compile(r"\s\.\.\.\s"), r"..."),
        # (re.compile(r"\s([:,])\s$"), r"\1"),  # .strip() takes care of it.
        (
            re.compile(r"\s([:,])"),
            r"\1",
        ),  # Just remove left padding. Punctuation in numbers won't be padded.
    ]

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r"([ (\[{<])\s``"), r"\1``"),
        (re.compile(r"(``)\s"), r"\1"),
        (re.compile(r"``"), r'"'),
    ]

    def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: List[str]
        :param convert_parentheses: if True, replace PTB symbols with parentheses,
            e.g. `-LRB-` to `(`. Defaults to False.
        :type convert_parentheses: bool, optional
        :return: str
        """
        text = " ".join(tokens)

        # Add extra space to make things easier
        text = " " + text + " "

        # Reverse the contractions regexes.
        # Note: CONTRACTIONS4 are not used in tokenization.
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r"\1\2", text)
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r"\1\2", text)

        # Reverse the regexes applied for ending quotes.
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # Undo the space padding.
        text = text.strip()

        # Reverse the padding on double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Reverse the padding regexes applied for parenthesis/brackets.
        for regexp, substitution in self.PARENS_BRACKETS:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for punctuations.
        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for starting quotes.
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.strip()

    def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, convert_parentheses)
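
# Editor's sketch (illustrative, not part of NLTK): a tokenize/detokenize
# round-trip. Exact whitespace is not restored (see the class note above),
# but the surface string of a simple sentence is:
#
#     >>> s = "They'll save and invest more."
#     >>> TreebankWordDetokenizer().detokenize(TreebankWordTokenizer().tokenize(s))
#     "They'll save and invest more."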

295
backend/venv/Lib/site-packages/nltk/tokenize/util.py
Normal file
@@ -0,0 +1,295 @@
# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

    >>> from nltk.tokenize.util import string_span_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
    (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            if left != len(s):
                yield left, len(s)
            break

        left = right + len(sep)


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

    >>> from nltk.tokenize.util import regexp_span_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
    (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        right, next = m.span()
        if right != left:
            yield left, right
        left = next
    yield left, len(s)
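
# Note (editorial): unlike string_span_tokenize above, the final ``yield`` is
# unconditional, so a trailing separator produces an empty final span, e.g.
# list(regexp_span_tokenize("a b ", r"\s"))[-1] == (4, 4).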


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> from nltk.tokenize.util import spans_to_relative
    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
    (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right


class CJKChars:
    """
    An object that enumerates the code points of the CJK characters as listed on
    https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]


def is_cjk(character):
    """
    Python port of Moses' code to check for CJK character.

    >>> CJKChars().ranges
    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
    >>> is_cjk(u'\u33fe')
    True
    >>> is_cjk(u'\uFE5F')
    False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    return any(
        [
            start <= ord(character) <= end
            for start, end in [
                (4352, 4607),
                (11904, 42191),
                (43072, 43135),
                (44032, 55215),
                (63744, 64255),
                (65072, 65103),
                (65381, 65500),
                (131072, 196607),
            ]
        ]
    )
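
# Note (editorial): the range list hardcoded inside is_cjk duplicates
# CJKChars.ranges, presumably inlined to avoid attribute lookups per call;
# the two listings must be kept in sync.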


def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

    >>> input_str = ''')| & < > ' " ] ['''
    >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
    >>> escape(input_str) == expected_output
    True
    >>> xml_escape(input_str)
    ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )


def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable
    for well-formed XML formatting into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

    >>> from xml.sax.saxutils import unescape
    >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
    >>> expected = ''')| & < > \' " ] ['''
    >>> xml_unescape(s) == expected
    True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )
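
# Editor's sketch (illustrative, not part of NLTK): xml_escape and
# xml_unescape are inverses over the extra Moses entities:
#
#     >>> xml_unescape(xml_escape("[a] | 'b'")) == "[a] | 'b'"
#     True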


def align_tokens(tokens, sentence):
    """
    This function attempts to find the offsets of the tokens in *s*, as a sequence
    of ``(start, end)`` tuples, given the tokens and also the source string.

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> from nltk.tokenize.util import align_tokens
    >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
    ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
    ... "on Saturday.")
    >>> tokens = TreebankWordTokenizer().tokenize(s)
    >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
    ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
    ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
    ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
    ... (123, 131), (131, 132)]
    >>> output = list(align_tokens(tokens, s))
    >>> len(tokens) == len(expected) == len(output)  # Check that length of tokens and tuples are the same.
    True
    >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
    True
    >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string corresponds to the tokens.
    True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int,int))
    """
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError as e:
            raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
        point = start + len(token)
        offsets.append((start, point))
    return offsets
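
# Editor's sketch (illustrative, not part of NLTK): align_tokens only
# requires that each token occur left-to-right in the source string:
#
#     >>> align_tokens(["a", "b"], "a b")
#     [(0, 1), (2, 3)]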